1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX9_W64
3 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1010_W32
4 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1010_W64
5 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1100_W32
6 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1100_W64
7 ; RUN: llc -O0 -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=W64-O0
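9 ; Test that we correctly legalize VGPR Rsrc operands in MUBUF instructions: the resource descriptor is copied into SGPRs with v_readfirstlane inside a loop that repeats, with exec masked to the lanes whose descriptor matches, until every lane has been handled.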
11 define float @mubuf_vgpr(ptr addrspace(8) %i, i32 %c) #0 {
12 ; GFX9_W64-LABEL: mubuf_vgpr:
14 ; GFX9_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15 ; GFX9_W64-NEXT: s_mov_b64 s[6:7], exec
16 ; GFX9_W64-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
17 ; GFX9_W64-NEXT: v_readfirstlane_b32 s8, v0
18 ; GFX9_W64-NEXT: v_readfirstlane_b32 s9, v1
19 ; GFX9_W64-NEXT: v_readfirstlane_b32 s10, v2
20 ; GFX9_W64-NEXT: v_readfirstlane_b32 s11, v3
21 ; GFX9_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
22 ; GFX9_W64-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
23 ; GFX9_W64-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
24 ; GFX9_W64-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
25 ; GFX9_W64-NEXT: s_nop 0
26 ; GFX9_W64-NEXT: buffer_load_format_x v5, v4, s[8:11], 0 idxen
27 ; GFX9_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
28 ; GFX9_W64-NEXT: ; implicit-def: $vgpr4
29 ; GFX9_W64-NEXT: s_xor_b64 exec, exec, s[4:5]
30 ; GFX9_W64-NEXT: s_cbranch_execnz .LBB0_1
31 ; GFX9_W64-NEXT: ; %bb.2:
32 ; GFX9_W64-NEXT: s_mov_b64 exec, s[6:7]
33 ; GFX9_W64-NEXT: s_waitcnt vmcnt(0)
34 ; GFX9_W64-NEXT: v_mov_b32_e32 v0, v5
35 ; GFX9_W64-NEXT: s_setpc_b64 s[30:31]
37 ; GFX1010_W32-LABEL: mubuf_vgpr:
38 ; GFX1010_W32: ; %bb.0:
39 ; GFX1010_W32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40 ; GFX1010_W32-NEXT: s_mov_b32 s5, exec_lo
41 ; GFX1010_W32-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
42 ; GFX1010_W32-NEXT: v_readfirstlane_b32 s8, v0
43 ; GFX1010_W32-NEXT: v_readfirstlane_b32 s9, v1
44 ; GFX1010_W32-NEXT: v_readfirstlane_b32 s10, v2
45 ; GFX1010_W32-NEXT: v_readfirstlane_b32 s11, v3
46 ; GFX1010_W32-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
47 ; GFX1010_W32-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
48 ; GFX1010_W32-NEXT: s_and_b32 s4, vcc_lo, s4
49 ; GFX1010_W32-NEXT: s_and_saveexec_b32 s4, s4
50 ; GFX1010_W32-NEXT: buffer_load_format_x v5, v4, s[8:11], 0 idxen
51 ; GFX1010_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
52 ; GFX1010_W32-NEXT: ; implicit-def: $vgpr4
53 ; GFX1010_W32-NEXT: s_waitcnt_depctr 0xffe3
54 ; GFX1010_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s4
55 ; GFX1010_W32-NEXT: s_cbranch_execnz .LBB0_1
56 ; GFX1010_W32-NEXT: ; %bb.2:
57 ; GFX1010_W32-NEXT: s_mov_b32 exec_lo, s5
58 ; GFX1010_W32-NEXT: s_waitcnt vmcnt(0)
59 ; GFX1010_W32-NEXT: v_mov_b32_e32 v0, v5
60 ; GFX1010_W32-NEXT: s_setpc_b64 s[30:31]
62 ; GFX1010_W64-LABEL: mubuf_vgpr:
63 ; GFX1010_W64: ; %bb.0:
64 ; GFX1010_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
65 ; GFX1010_W64-NEXT: s_mov_b64 s[6:7], exec
66 ; GFX1010_W64-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
67 ; GFX1010_W64-NEXT: v_readfirstlane_b32 s8, v0
68 ; GFX1010_W64-NEXT: v_readfirstlane_b32 s9, v1
69 ; GFX1010_W64-NEXT: v_readfirstlane_b32 s10, v2
70 ; GFX1010_W64-NEXT: v_readfirstlane_b32 s11, v3
71 ; GFX1010_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
72 ; GFX1010_W64-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
73 ; GFX1010_W64-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
74 ; GFX1010_W64-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
75 ; GFX1010_W64-NEXT: buffer_load_format_x v5, v4, s[8:11], 0 idxen
76 ; GFX1010_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
77 ; GFX1010_W64-NEXT: ; implicit-def: $vgpr4
78 ; GFX1010_W64-NEXT: s_waitcnt_depctr 0xffe3
79 ; GFX1010_W64-NEXT: s_xor_b64 exec, exec, s[4:5]
80 ; GFX1010_W64-NEXT: s_cbranch_execnz .LBB0_1
81 ; GFX1010_W64-NEXT: ; %bb.2:
82 ; GFX1010_W64-NEXT: s_mov_b64 exec, s[6:7]
83 ; GFX1010_W64-NEXT: s_waitcnt vmcnt(0)
84 ; GFX1010_W64-NEXT: v_mov_b32_e32 v0, v5
85 ; GFX1010_W64-NEXT: s_setpc_b64 s[30:31]
87 ; GFX1100_W32-LABEL: mubuf_vgpr:
88 ; GFX1100_W32: ; %bb.0:
89 ; GFX1100_W32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
90 ; GFX1100_W32-NEXT: s_mov_b32 s1, exec_lo
91 ; GFX1100_W32-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
92 ; GFX1100_W32-NEXT: v_readfirstlane_b32 s4, v0
93 ; GFX1100_W32-NEXT: v_readfirstlane_b32 s5, v1
94 ; GFX1100_W32-NEXT: v_readfirstlane_b32 s6, v2
95 ; GFX1100_W32-NEXT: v_readfirstlane_b32 s7, v3
96 ; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
97 ; GFX1100_W32-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
98 ; GFX1100_W32-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
99 ; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
100 ; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0
101 ; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0
102 ; GFX1100_W32-NEXT: buffer_load_format_x v5, v4, s[4:7], 0 idxen
103 ; GFX1100_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
104 ; GFX1100_W32-NEXT: ; implicit-def: $vgpr4
105 ; GFX1100_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s0
106 ; GFX1100_W32-NEXT: s_cbranch_execnz .LBB0_1
107 ; GFX1100_W32-NEXT: ; %bb.2:
108 ; GFX1100_W32-NEXT: s_mov_b32 exec_lo, s1
109 ; GFX1100_W32-NEXT: s_waitcnt vmcnt(0)
110 ; GFX1100_W32-NEXT: v_mov_b32_e32 v0, v5
111 ; GFX1100_W32-NEXT: s_setpc_b64 s[30:31]
113 ; GFX1100_W64-LABEL: mubuf_vgpr:
114 ; GFX1100_W64: ; %bb.0:
115 ; GFX1100_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
116 ; GFX1100_W64-NEXT: s_mov_b64 s[2:3], exec
117 ; GFX1100_W64-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
118 ; GFX1100_W64-NEXT: v_readfirstlane_b32 s4, v0
119 ; GFX1100_W64-NEXT: v_readfirstlane_b32 s5, v1
120 ; GFX1100_W64-NEXT: v_readfirstlane_b32 s6, v2
121 ; GFX1100_W64-NEXT: v_readfirstlane_b32 s7, v3
122 ; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
123 ; GFX1100_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
124 ; GFX1100_W64-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
125 ; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
126 ; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
127 ; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
128 ; GFX1100_W64-NEXT: buffer_load_format_x v5, v4, s[4:7], 0 idxen
129 ; GFX1100_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
130 ; GFX1100_W64-NEXT: ; implicit-def: $vgpr4
131 ; GFX1100_W64-NEXT: s_xor_b64 exec, exec, s[0:1]
132 ; GFX1100_W64-NEXT: s_cbranch_execnz .LBB0_1
133 ; GFX1100_W64-NEXT: ; %bb.2:
134 ; GFX1100_W64-NEXT: s_mov_b64 exec, s[2:3]
135 ; GFX1100_W64-NEXT: s_waitcnt vmcnt(0)
136 ; GFX1100_W64-NEXT: v_mov_b32_e32 v0, v5
137 ; GFX1100_W64-NEXT: s_setpc_b64 s[30:31]
139 ; W64-O0-LABEL: mubuf_vgpr:
141 ; W64-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
142 ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
143 ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
144 ; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
145 ; W64-O0-NEXT: s_mov_b64 exec, s[4:5]
146 ; W64-O0-NEXT: ; implicit-def: $vgpr5 : SGPR spill to VGPR lane
147 ; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
148 ; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
149 ; W64-O0-NEXT: v_mov_b32_e32 v6, v2
150 ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
151 ; W64-O0-NEXT: v_mov_b32_e32 v3, v1
152 ; W64-O0-NEXT: v_mov_b32_e32 v1, v0
153 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1
154 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
155 ; W64-O0-NEXT: s_mov_b64 exec, s[16:17]
156 ; W64-O0-NEXT: ; implicit-def: $sgpr4
157 ; W64-O0-NEXT: ; implicit-def: $sgpr4
158 ; W64-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
159 ; W64-O0-NEXT: s_waitcnt vmcnt(1)
160 ; W64-O0-NEXT: v_mov_b32_e32 v7, v2
161 ; W64-O0-NEXT: v_mov_b32_e32 v5, v7
162 ; W64-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
163 ; W64-O0-NEXT: ; implicit-def: $sgpr4
164 ; W64-O0-NEXT: ; implicit-def: $sgpr4
165 ; W64-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
166 ; W64-O0-NEXT: v_mov_b32_e32 v2, v3
167 ; W64-O0-NEXT: v_mov_b32_e32 v7, v2
168 ; W64-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr1_vgpr2 killed $exec
169 ; W64-O0-NEXT: ; implicit-def: $sgpr4
170 ; W64-O0-NEXT: ; implicit-def: $sgpr4
171 ; W64-O0-NEXT: ; implicit-def: $sgpr4
172 ; W64-O0-NEXT: ; implicit-def: $sgpr4
173 ; W64-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec
174 ; W64-O0-NEXT: v_mov_b32_e32 v2, v7
175 ; W64-O0-NEXT: v_mov_b32_e32 v3, v6
176 ; W64-O0-NEXT: v_mov_b32_e32 v4, v5
177 ; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
178 ; W64-O0-NEXT: s_waitcnt vmcnt(0)
179 ; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
180 ; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
181 ; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
182 ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
183 ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
184 ; W64-O0-NEXT: s_mov_b32 s4, 0
185 ; W64-O0-NEXT: v_writelane_b32 v0, s4, 0
186 ; W64-O0-NEXT: s_mov_b64 s[4:5], exec
187 ; W64-O0-NEXT: v_writelane_b32 v0, s4, 1
188 ; W64-O0-NEXT: v_writelane_b32 v0, s5, 2
189 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1
190 ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
191 ; W64-O0-NEXT: s_mov_b64 exec, s[16:17]
192 ; W64-O0-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
193 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1
194 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
195 ; W64-O0-NEXT: s_mov_b64 exec, s[16:17]
196 ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
197 ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
198 ; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
199 ; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
200 ; W64-O0-NEXT: s_waitcnt vmcnt(0)
201 ; W64-O0-NEXT: v_readfirstlane_b32 s8, v1
202 ; W64-O0-NEXT: v_readfirstlane_b32 s12, v2
203 ; W64-O0-NEXT: s_mov_b32 s4, s8
204 ; W64-O0-NEXT: s_mov_b32 s5, s12
205 ; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[1:2]
206 ; W64-O0-NEXT: v_readfirstlane_b32 s7, v3
207 ; W64-O0-NEXT: v_readfirstlane_b32 s6, v4
208 ; W64-O0-NEXT: s_mov_b32 s10, s7
209 ; W64-O0-NEXT: s_mov_b32 s11, s6
210 ; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[3:4]
211 ; W64-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[10:11]
212 ; W64-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
213 ; W64-O0-NEXT: s_mov_b32 s9, s12
214 ; W64-O0-NEXT: s_mov_b32 s10, s7
215 ; W64-O0-NEXT: s_mov_b32 s11, s6
216 ; W64-O0-NEXT: v_writelane_b32 v0, s8, 3
217 ; W64-O0-NEXT: v_writelane_b32 v0, s9, 4
218 ; W64-O0-NEXT: v_writelane_b32 v0, s10, 5
219 ; W64-O0-NEXT: v_writelane_b32 v0, s11, 6
220 ; W64-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
221 ; W64-O0-NEXT: v_writelane_b32 v0, s4, 7
222 ; W64-O0-NEXT: v_writelane_b32 v0, s5, 8
223 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1
224 ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
225 ; W64-O0-NEXT: s_mov_b64 exec, s[16:17]
226 ; W64-O0-NEXT: ; %bb.2: ; in Loop: Header=BB0_1 Depth=1
227 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1
228 ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
229 ; W64-O0-NEXT: s_mov_b64 exec, s[16:17]
230 ; W64-O0-NEXT: s_waitcnt vmcnt(0)
231 ; W64-O0-NEXT: v_readlane_b32 s4, v1, 7
232 ; W64-O0-NEXT: v_readlane_b32 s5, v1, 8
233 ; W64-O0-NEXT: v_readlane_b32 s8, v1, 3
234 ; W64-O0-NEXT: v_readlane_b32 s9, v1, 4
235 ; W64-O0-NEXT: v_readlane_b32 s10, v1, 5
236 ; W64-O0-NEXT: v_readlane_b32 s11, v1, 6
237 ; W64-O0-NEXT: v_readlane_b32 s6, v1, 0
238 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
239 ; W64-O0-NEXT: s_waitcnt vmcnt(0)
240 ; W64-O0-NEXT: s_nop 2
241 ; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen
242 ; W64-O0-NEXT: s_waitcnt vmcnt(0)
243 ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
244 ; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5]
245 ; W64-O0-NEXT: s_cbranch_execnz .LBB0_1
246 ; W64-O0-NEXT: ; %bb.3:
247 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1
248 ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
249 ; W64-O0-NEXT: s_mov_b64 exec, s[16:17]
250 ; W64-O0-NEXT: s_waitcnt vmcnt(0)
251 ; W64-O0-NEXT: v_readlane_b32 s4, v1, 1
252 ; W64-O0-NEXT: v_readlane_b32 s5, v1, 2
253 ; W64-O0-NEXT: s_mov_b64 exec, s[4:5]
254 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
255 ; W64-O0-NEXT: ; kill: killed $vgpr1
256 ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
257 ; W64-O0-NEXT: s_waitcnt vmcnt(0)
258 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
259 ; W64-O0-NEXT: s_nop 0
260 ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
261 ; W64-O0-NEXT: s_mov_b64 exec, s[4:5]
262 ; W64-O0-NEXT: s_waitcnt vmcnt(0)
263 ; W64-O0-NEXT: s_setpc_b64 s[30:31]
264 %call = call float @llvm.amdgcn.struct.ptr.buffer.load.format.f32(ptr addrspace(8) %i, i32 %c, i32 0, i32 0, i32 0) #1
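270 ; FIXME: redundant s_mov - exec is restored from the saved copy after the first loop and then immediately saved again before the second loop.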
272 define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8) %j, i32 %c, ptr addrspace(1) %out0, ptr addrspace(1) %out1) #0 {
273 ; GFX9_W64-LABEL: mubuf_vgpr_adjacent_in_block:
274 ; GFX9_W64: ; %bb.0: ; %entry
275 ; GFX9_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
276 ; GFX9_W64-NEXT: s_mov_b64 s[6:7], exec
277 ; GFX9_W64-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
278 ; GFX9_W64-NEXT: v_readfirstlane_b32 s8, v0
279 ; GFX9_W64-NEXT: v_readfirstlane_b32 s9, v1
280 ; GFX9_W64-NEXT: v_readfirstlane_b32 s10, v2
281 ; GFX9_W64-NEXT: v_readfirstlane_b32 s11, v3
282 ; GFX9_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
283 ; GFX9_W64-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
284 ; GFX9_W64-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
285 ; GFX9_W64-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
286 ; GFX9_W64-NEXT: s_nop 0
287 ; GFX9_W64-NEXT: buffer_load_format_x v13, v8, s[8:11], 0 idxen
288 ; GFX9_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
289 ; GFX9_W64-NEXT: s_xor_b64 exec, exec, s[4:5]
290 ; GFX9_W64-NEXT: s_cbranch_execnz .LBB1_1
291 ; GFX9_W64-NEXT: ; %bb.2:
292 ; GFX9_W64-NEXT: s_mov_b64 exec, s[6:7]
293 ; GFX9_W64-NEXT: s_mov_b64 s[6:7], exec
294 ; GFX9_W64-NEXT: .LBB1_3: ; =>This Inner Loop Header: Depth=1
295 ; GFX9_W64-NEXT: v_readfirstlane_b32 s8, v4
296 ; GFX9_W64-NEXT: v_readfirstlane_b32 s9, v5
297 ; GFX9_W64-NEXT: v_readfirstlane_b32 s10, v6
298 ; GFX9_W64-NEXT: v_readfirstlane_b32 s11, v7
299 ; GFX9_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[4:5]
300 ; GFX9_W64-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[6:7]
301 ; GFX9_W64-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
302 ; GFX9_W64-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
303 ; GFX9_W64-NEXT: s_nop 0
304 ; GFX9_W64-NEXT: buffer_load_format_x v0, v8, s[8:11], 0 idxen
305 ; GFX9_W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
306 ; GFX9_W64-NEXT: ; implicit-def: $vgpr8
307 ; GFX9_W64-NEXT: s_xor_b64 exec, exec, s[4:5]
308 ; GFX9_W64-NEXT: s_cbranch_execnz .LBB1_3
309 ; GFX9_W64-NEXT: ; %bb.4:
310 ; GFX9_W64-NEXT: s_mov_b64 exec, s[6:7]
311 ; GFX9_W64-NEXT: s_waitcnt vmcnt(1)
312 ; GFX9_W64-NEXT: global_store_dword v[9:10], v13, off
313 ; GFX9_W64-NEXT: s_waitcnt vmcnt(0)
314 ; GFX9_W64-NEXT: global_store_dword v[11:12], v0, off
315 ; GFX9_W64-NEXT: s_waitcnt vmcnt(0)
316 ; GFX9_W64-NEXT: s_setpc_b64 s[30:31]
318 ; GFX1010_W32-LABEL: mubuf_vgpr_adjacent_in_block:
319 ; GFX1010_W32: ; %bb.0: ; %entry
320 ; GFX1010_W32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
321 ; GFX1010_W32-NEXT: s_mov_b32 s5, exec_lo
322 ; GFX1010_W32-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
323 ; GFX1010_W32-NEXT: v_readfirstlane_b32 s8, v0
324 ; GFX1010_W32-NEXT: v_readfirstlane_b32 s9, v1
325 ; GFX1010_W32-NEXT: v_readfirstlane_b32 s10, v2
326 ; GFX1010_W32-NEXT: v_readfirstlane_b32 s11, v3
327 ; GFX1010_W32-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
328 ; GFX1010_W32-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
329 ; GFX1010_W32-NEXT: s_and_b32 s4, vcc_lo, s4
330 ; GFX1010_W32-NEXT: s_and_saveexec_b32 s4, s4
331 ; GFX1010_W32-NEXT: buffer_load_format_x v13, v8, s[8:11], 0 idxen
332 ; GFX1010_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
333 ; GFX1010_W32-NEXT: s_waitcnt_depctr 0xffe3
334 ; GFX1010_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s4
335 ; GFX1010_W32-NEXT: s_cbranch_execnz .LBB1_1
336 ; GFX1010_W32-NEXT: ; %bb.2:
337 ; GFX1010_W32-NEXT: s_mov_b32 exec_lo, s5
338 ; GFX1010_W32-NEXT: s_mov_b32 s5, exec_lo
339 ; GFX1010_W32-NEXT: .LBB1_3: ; =>This Inner Loop Header: Depth=1
340 ; GFX1010_W32-NEXT: v_readfirstlane_b32 s8, v4
341 ; GFX1010_W32-NEXT: v_readfirstlane_b32 s9, v5
342 ; GFX1010_W32-NEXT: v_readfirstlane_b32 s10, v6
343 ; GFX1010_W32-NEXT: v_readfirstlane_b32 s11, v7
344 ; GFX1010_W32-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[4:5]
345 ; GFX1010_W32-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[6:7]
346 ; GFX1010_W32-NEXT: s_and_b32 s4, vcc_lo, s4
347 ; GFX1010_W32-NEXT: s_and_saveexec_b32 s4, s4
348 ; GFX1010_W32-NEXT: buffer_load_format_x v0, v8, s[8:11], 0 idxen
349 ; GFX1010_W32-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
350 ; GFX1010_W32-NEXT: ; implicit-def: $vgpr8
351 ; GFX1010_W32-NEXT: s_waitcnt_depctr 0xffe3
352 ; GFX1010_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s4
353 ; GFX1010_W32-NEXT: s_cbranch_execnz .LBB1_3
354 ; GFX1010_W32-NEXT: ; %bb.4:
355 ; GFX1010_W32-NEXT: s_mov_b32 exec_lo, s5
356 ; GFX1010_W32-NEXT: s_waitcnt vmcnt(1)
357 ; GFX1010_W32-NEXT: global_store_dword v[9:10], v13, off
358 ; GFX1010_W32-NEXT: s_waitcnt_vscnt null, 0x0
359 ; GFX1010_W32-NEXT: s_waitcnt vmcnt(0)
360 ; GFX1010_W32-NEXT: global_store_dword v[11:12], v0, off
361 ; GFX1010_W32-NEXT: s_waitcnt_vscnt null, 0x0
362 ; GFX1010_W32-NEXT: s_setpc_b64 s[30:31]
364 ; GFX1010_W64-LABEL: mubuf_vgpr_adjacent_in_block:
365 ; GFX1010_W64: ; %bb.0: ; %entry
366 ; GFX1010_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
367 ; GFX1010_W64-NEXT: s_mov_b64 s[6:7], exec
368 ; GFX1010_W64-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
369 ; GFX1010_W64-NEXT: v_readfirstlane_b32 s8, v0
370 ; GFX1010_W64-NEXT: v_readfirstlane_b32 s9, v1
371 ; GFX1010_W64-NEXT: v_readfirstlane_b32 s10, v2
372 ; GFX1010_W64-NEXT: v_readfirstlane_b32 s11, v3
373 ; GFX1010_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
374 ; GFX1010_W64-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
375 ; GFX1010_W64-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
376 ; GFX1010_W64-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
377 ; GFX1010_W64-NEXT: buffer_load_format_x v13, v8, s[8:11], 0 idxen
378 ; GFX1010_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
379 ; GFX1010_W64-NEXT: s_waitcnt_depctr 0xffe3
380 ; GFX1010_W64-NEXT: s_xor_b64 exec, exec, s[4:5]
381 ; GFX1010_W64-NEXT: s_cbranch_execnz .LBB1_1
382 ; GFX1010_W64-NEXT: ; %bb.2:
383 ; GFX1010_W64-NEXT: s_mov_b64 exec, s[6:7]
384 ; GFX1010_W64-NEXT: s_mov_b64 s[6:7], exec
385 ; GFX1010_W64-NEXT: .LBB1_3: ; =>This Inner Loop Header: Depth=1
386 ; GFX1010_W64-NEXT: v_readfirstlane_b32 s8, v4
387 ; GFX1010_W64-NEXT: v_readfirstlane_b32 s9, v5
388 ; GFX1010_W64-NEXT: v_readfirstlane_b32 s10, v6
389 ; GFX1010_W64-NEXT: v_readfirstlane_b32 s11, v7
390 ; GFX1010_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[4:5]
391 ; GFX1010_W64-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[6:7]
392 ; GFX1010_W64-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
393 ; GFX1010_W64-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
394 ; GFX1010_W64-NEXT: buffer_load_format_x v0, v8, s[8:11], 0 idxen
395 ; GFX1010_W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
396 ; GFX1010_W64-NEXT: ; implicit-def: $vgpr8
397 ; GFX1010_W64-NEXT: s_waitcnt_depctr 0xffe3
398 ; GFX1010_W64-NEXT: s_xor_b64 exec, exec, s[4:5]
399 ; GFX1010_W64-NEXT: s_cbranch_execnz .LBB1_3
400 ; GFX1010_W64-NEXT: ; %bb.4:
401 ; GFX1010_W64-NEXT: s_mov_b64 exec, s[6:7]
402 ; GFX1010_W64-NEXT: s_waitcnt vmcnt(1)
403 ; GFX1010_W64-NEXT: global_store_dword v[9:10], v13, off
404 ; GFX1010_W64-NEXT: s_waitcnt_vscnt null, 0x0
405 ; GFX1010_W64-NEXT: s_waitcnt vmcnt(0)
406 ; GFX1010_W64-NEXT: global_store_dword v[11:12], v0, off
407 ; GFX1010_W64-NEXT: s_waitcnt_vscnt null, 0x0
408 ; GFX1010_W64-NEXT: s_setpc_b64 s[30:31]
410 ; GFX1100_W32-LABEL: mubuf_vgpr_adjacent_in_block:
411 ; GFX1100_W32: ; %bb.0: ; %entry
412 ; GFX1100_W32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
413 ; GFX1100_W32-NEXT: s_mov_b32 s1, exec_lo
414 ; GFX1100_W32-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
415 ; GFX1100_W32-NEXT: v_readfirstlane_b32 s4, v0
416 ; GFX1100_W32-NEXT: v_readfirstlane_b32 s5, v1
417 ; GFX1100_W32-NEXT: v_readfirstlane_b32 s6, v2
418 ; GFX1100_W32-NEXT: v_readfirstlane_b32 s7, v3
419 ; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
420 ; GFX1100_W32-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
421 ; GFX1100_W32-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
422 ; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
423 ; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0
424 ; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0
425 ; GFX1100_W32-NEXT: buffer_load_format_x v13, v8, s[4:7], 0 idxen
426 ; GFX1100_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
427 ; GFX1100_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s0
428 ; GFX1100_W32-NEXT: s_cbranch_execnz .LBB1_1
429 ; GFX1100_W32-NEXT: ; %bb.2:
430 ; GFX1100_W32-NEXT: s_mov_b32 exec_lo, s1
431 ; GFX1100_W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
432 ; GFX1100_W32-NEXT: s_mov_b32 s1, exec_lo
433 ; GFX1100_W32-NEXT: .LBB1_3: ; =>This Inner Loop Header: Depth=1
434 ; GFX1100_W32-NEXT: v_readfirstlane_b32 s4, v4
435 ; GFX1100_W32-NEXT: v_readfirstlane_b32 s5, v5
436 ; GFX1100_W32-NEXT: v_readfirstlane_b32 s6, v6
437 ; GFX1100_W32-NEXT: v_readfirstlane_b32 s7, v7
438 ; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
439 ; GFX1100_W32-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5]
440 ; GFX1100_W32-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7]
441 ; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
442 ; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0
443 ; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0
444 ; GFX1100_W32-NEXT: buffer_load_format_x v0, v8, s[4:7], 0 idxen
445 ; GFX1100_W32-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
446 ; GFX1100_W32-NEXT: ; implicit-def: $vgpr8
447 ; GFX1100_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s0
448 ; GFX1100_W32-NEXT: s_cbranch_execnz .LBB1_3
449 ; GFX1100_W32-NEXT: ; %bb.4:
450 ; GFX1100_W32-NEXT: s_mov_b32 exec_lo, s1
451 ; GFX1100_W32-NEXT: s_waitcnt vmcnt(1)
452 ; GFX1100_W32-NEXT: global_store_b32 v[9:10], v13, off dlc
453 ; GFX1100_W32-NEXT: s_waitcnt_vscnt null, 0x0
454 ; GFX1100_W32-NEXT: s_waitcnt vmcnt(0)
455 ; GFX1100_W32-NEXT: global_store_b32 v[11:12], v0, off dlc
456 ; GFX1100_W32-NEXT: s_waitcnt_vscnt null, 0x0
457 ; GFX1100_W32-NEXT: s_setpc_b64 s[30:31]
459 ; GFX1100_W64-LABEL: mubuf_vgpr_adjacent_in_block:
460 ; GFX1100_W64: ; %bb.0: ; %entry
461 ; GFX1100_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
462 ; GFX1100_W64-NEXT: s_mov_b64 s[2:3], exec
463 ; GFX1100_W64-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
464 ; GFX1100_W64-NEXT: v_readfirstlane_b32 s4, v0
465 ; GFX1100_W64-NEXT: v_readfirstlane_b32 s5, v1
466 ; GFX1100_W64-NEXT: v_readfirstlane_b32 s6, v2
467 ; GFX1100_W64-NEXT: v_readfirstlane_b32 s7, v3
468 ; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
469 ; GFX1100_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
470 ; GFX1100_W64-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
471 ; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
472 ; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
473 ; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
474 ; GFX1100_W64-NEXT: buffer_load_format_x v13, v8, s[4:7], 0 idxen
475 ; GFX1100_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
476 ; GFX1100_W64-NEXT: s_xor_b64 exec, exec, s[0:1]
477 ; GFX1100_W64-NEXT: s_cbranch_execnz .LBB1_1
478 ; GFX1100_W64-NEXT: ; %bb.2:
479 ; GFX1100_W64-NEXT: s_mov_b64 exec, s[2:3]
480 ; GFX1100_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
481 ; GFX1100_W64-NEXT: s_mov_b64 s[2:3], exec
482 ; GFX1100_W64-NEXT: .LBB1_3: ; =>This Inner Loop Header: Depth=1
483 ; GFX1100_W64-NEXT: v_readfirstlane_b32 s4, v4
484 ; GFX1100_W64-NEXT: v_readfirstlane_b32 s5, v5
485 ; GFX1100_W64-NEXT: v_readfirstlane_b32 s6, v6
486 ; GFX1100_W64-NEXT: v_readfirstlane_b32 s7, v7
487 ; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
488 ; GFX1100_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[4:5]
489 ; GFX1100_W64-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[6:7]
490 ; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
491 ; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
492 ; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
493 ; GFX1100_W64-NEXT: buffer_load_format_x v0, v8, s[4:7], 0 idxen
494 ; GFX1100_W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
495 ; GFX1100_W64-NEXT: ; implicit-def: $vgpr8
496 ; GFX1100_W64-NEXT: s_xor_b64 exec, exec, s[0:1]
497 ; GFX1100_W64-NEXT: s_cbranch_execnz .LBB1_3
498 ; GFX1100_W64-NEXT: ; %bb.4:
499 ; GFX1100_W64-NEXT: s_mov_b64 exec, s[2:3]
500 ; GFX1100_W64-NEXT: s_waitcnt vmcnt(1)
501 ; GFX1100_W64-NEXT: global_store_b32 v[9:10], v13, off dlc
502 ; GFX1100_W64-NEXT: s_waitcnt_vscnt null, 0x0
503 ; GFX1100_W64-NEXT: s_waitcnt vmcnt(0)
504 ; GFX1100_W64-NEXT: global_store_b32 v[11:12], v0, off dlc
505 ; GFX1100_W64-NEXT: s_waitcnt_vscnt null, 0x0
506 ; GFX1100_W64-NEXT: s_setpc_b64 s[30:31]
508 ; W64-O0-LABEL: mubuf_vgpr_adjacent_in_block:
509 ; W64-O0: ; %bb.0: ; %entry
510 ; W64-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
511 ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
512 ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
513 ; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
514 ; W64-O0-NEXT: s_mov_b64 exec, s[4:5]
515 ; W64-O0-NEXT: ; implicit-def: $vgpr13 : SGPR spill to VGPR lane
516 ; W64-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
517 ; W64-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
518 ; W64-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
519 ; W64-O0-NEXT: v_mov_b32_e32 v14, v6
520 ; W64-O0-NEXT: v_mov_b32_e32 v9, v5
521 ; W64-O0-NEXT: v_mov_b32_e32 v13, v4
522 ; W64-O0-NEXT: v_mov_b32_e32 v4, v3
523 ; W64-O0-NEXT: v_mov_b32_e32 v8, v2
524 ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
525 ; W64-O0-NEXT: v_mov_b32_e32 v5, v1
526 ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
527 ; W64-O0-NEXT: v_mov_b32_e32 v3, v0
528 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1
529 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
530 ; W64-O0-NEXT: s_mov_b64 exec, s[16:17]
531 ; W64-O0-NEXT: ; implicit-def: $sgpr4
532 ; W64-O0-NEXT: ; implicit-def: $sgpr4
533 ; W64-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
534 ; W64-O0-NEXT: v_mov_b32_e32 v15, v7
535 ; W64-O0-NEXT: v_mov_b32_e32 v6, v15
536 ; W64-O0-NEXT: v_mov_b32_e32 v7, v14
537 ; W64-O0-NEXT: ; implicit-def: $sgpr4
538 ; W64-O0-NEXT: ; implicit-def: $sgpr4
539 ; W64-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec
540 ; W64-O0-NEXT: v_mov_b32_e32 v14, v9
541 ; W64-O0-NEXT: v_mov_b32_e32 v9, v14
542 ; W64-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 killed $vgpr13_vgpr14 killed $exec
543 ; W64-O0-NEXT: ; implicit-def: $sgpr4
544 ; W64-O0-NEXT: ; implicit-def: $sgpr4
545 ; W64-O0-NEXT: ; implicit-def: $sgpr4
546 ; W64-O0-NEXT: ; implicit-def: $sgpr4
547 ; W64-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14_vgpr15_vgpr16 killed $exec
548 ; W64-O0-NEXT: v_mov_b32_e32 v14, v9
549 ; W64-O0-NEXT: v_mov_b32_e32 v15, v7
550 ; W64-O0-NEXT: v_mov_b32_e32 v16, v6
551 ; W64-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
552 ; W64-O0-NEXT: s_waitcnt vmcnt(0)
553 ; W64-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
554 ; W64-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
555 ; W64-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
556 ; W64-O0-NEXT: ; implicit-def: $sgpr4
557 ; W64-O0-NEXT: ; implicit-def: $sgpr4
558 ; W64-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
559 ; W64-O0-NEXT: v_mov_b32_e32 v9, v4
560 ; W64-O0-NEXT: v_mov_b32_e32 v7, v9
561 ; W64-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $vgpr8_vgpr9 killed $exec
562 ; W64-O0-NEXT: ; implicit-def: $sgpr4
563 ; W64-O0-NEXT: ; implicit-def: $sgpr4
564 ; W64-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
565 ; W64-O0-NEXT: v_mov_b32_e32 v4, v5
566 ; W64-O0-NEXT: v_mov_b32_e32 v9, v4
567 ; W64-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr3_vgpr4 killed $exec
568 ; W64-O0-NEXT: ; implicit-def: $sgpr4
569 ; W64-O0-NEXT: ; implicit-def: $sgpr4
570 ; W64-O0-NEXT: ; implicit-def: $sgpr4
571 ; W64-O0-NEXT: ; implicit-def: $sgpr4
572 ; W64-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4_vgpr5_vgpr6 killed $exec
573 ; W64-O0-NEXT: v_mov_b32_e32 v4, v9
574 ; W64-O0-NEXT: v_mov_b32_e32 v5, v8
575 ; W64-O0-NEXT: v_mov_b32_e32 v6, v7
576 ; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
577 ; W64-O0-NEXT: s_waitcnt vmcnt(0)
578 ; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
579 ; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
580 ; W64-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
581 ; W64-O0-NEXT: ; implicit-def: $sgpr4
582 ; W64-O0-NEXT: ; implicit-def: $sgpr4
583 ; W64-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
584 ; W64-O0-NEXT: v_mov_b32_e32 v3, v12
585 ; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
586 ; W64-O0-NEXT: s_waitcnt vmcnt(0)
587 ; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
588 ; W64-O0-NEXT: ; implicit-def: $sgpr4
589 ; W64-O0-NEXT: ; implicit-def: $sgpr4
590 ; W64-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
591 ; W64-O0-NEXT: v_mov_b32_e32 v2, v10
592 ; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
593 ; W64-O0-NEXT: s_waitcnt vmcnt(0)
594 ; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
595 ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
596 ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
597 ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
598 ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
599 ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
600 ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
601 ; W64-O0-NEXT: s_mov_b32 s4, 0
602 ; W64-O0-NEXT: v_writelane_b32 v0, s4, 0
603 ; W64-O0-NEXT: s_mov_b64 s[4:5], exec
604 ; W64-O0-NEXT: v_writelane_b32 v0, s4, 1
605 ; W64-O0-NEXT: v_writelane_b32 v0, s5, 2
606 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1
607 ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
608 ; W64-O0-NEXT: s_mov_b64 exec, s[16:17]
609 ; W64-O0-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
610 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1
611 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
612 ; W64-O0-NEXT: s_mov_b64 exec, s[16:17]
613 ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
614 ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
615 ; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
616 ; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
617 ; W64-O0-NEXT: s_waitcnt vmcnt(0)
618 ; W64-O0-NEXT: v_readfirstlane_b32 s8, v1
619 ; W64-O0-NEXT: v_readfirstlane_b32 s12, v2
620 ; W64-O0-NEXT: s_mov_b32 s4, s8
621 ; W64-O0-NEXT: s_mov_b32 s5, s12
622 ; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[1:2]
623 ; W64-O0-NEXT: v_readfirstlane_b32 s7, v3
624 ; W64-O0-NEXT: v_readfirstlane_b32 s6, v4
625 ; W64-O0-NEXT: s_mov_b32 s10, s7
626 ; W64-O0-NEXT: s_mov_b32 s11, s6
627 ; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[3:4]
628 ; W64-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[10:11]
629 ; W64-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
630 ; W64-O0-NEXT: s_mov_b32 s9, s12
631 ; W64-O0-NEXT: s_mov_b32 s10, s7
632 ; W64-O0-NEXT: s_mov_b32 s11, s6
633 ; W64-O0-NEXT: v_writelane_b32 v0, s8, 3
634 ; W64-O0-NEXT: v_writelane_b32 v0, s9, 4
635 ; W64-O0-NEXT: v_writelane_b32 v0, s10, 5
636 ; W64-O0-NEXT: v_writelane_b32 v0, s11, 6
637 ; W64-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
638 ; W64-O0-NEXT: v_writelane_b32 v0, s4, 7
639 ; W64-O0-NEXT: v_writelane_b32 v0, s5, 8
640 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1
641 ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
642 ; W64-O0-NEXT: s_mov_b64 exec, s[16:17]
643 ; W64-O0-NEXT: ; %bb.2: ; in Loop: Header=BB1_1 Depth=1
644 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1
645 ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
646 ; W64-O0-NEXT: s_mov_b64 exec, s[16:17]
647 ; W64-O0-NEXT: s_waitcnt vmcnt(0)
648 ; W64-O0-NEXT: v_readlane_b32 s4, v1, 7
649 ; W64-O0-NEXT: v_readlane_b32 s5, v1, 8
650 ; W64-O0-NEXT: v_readlane_b32 s8, v1, 3
651 ; W64-O0-NEXT: v_readlane_b32 s9, v1, 4
652 ; W64-O0-NEXT: v_readlane_b32 s10, v1, 5
653 ; W64-O0-NEXT: v_readlane_b32 s11, v1, 6
654 ; W64-O0-NEXT: v_readlane_b32 s6, v1, 0
655 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
656 ; W64-O0-NEXT: s_waitcnt vmcnt(0)
657 ; W64-O0-NEXT: s_nop 2
658 ; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen
659 ; W64-O0-NEXT: s_waitcnt vmcnt(0)
660 ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
661 ; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5]
662 ; W64-O0-NEXT: s_cbranch_execnz .LBB1_1
663 ; W64-O0-NEXT: ; %bb.3:
664 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1
665 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
666 ; W64-O0-NEXT: s_mov_b64 exec, s[16:17]
667 ; W64-O0-NEXT: s_waitcnt vmcnt(0)
668 ; W64-O0-NEXT: v_readlane_b32 s4, v0, 1
669 ; W64-O0-NEXT: v_readlane_b32 s5, v0, 2
670 ; W64-O0-NEXT: s_mov_b64 exec, s[4:5]
671 ; W64-O0-NEXT: s_mov_b64 s[4:5], exec
672 ; W64-O0-NEXT: v_writelane_b32 v0, s4, 9
673 ; W64-O0-NEXT: v_writelane_b32 v0, s5, 10
674 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1
675 ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
676 ; W64-O0-NEXT: s_mov_b64 exec, s[16:17]
677 ; W64-O0-NEXT: .LBB1_4: ; =>This Inner Loop Header: Depth=1
678 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1
679 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
680 ; W64-O0-NEXT: s_mov_b64 exec, s[16:17]
681 ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
682 ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
683 ; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
684 ; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
685 ; W64-O0-NEXT: s_waitcnt vmcnt(0)
686 ; W64-O0-NEXT: v_readfirstlane_b32 s8, v1
687 ; W64-O0-NEXT: v_readfirstlane_b32 s12, v2
688 ; W64-O0-NEXT: s_mov_b32 s4, s8
689 ; W64-O0-NEXT: s_mov_b32 s5, s12
690 ; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[1:2]
691 ; W64-O0-NEXT: v_readfirstlane_b32 s7, v3
692 ; W64-O0-NEXT: v_readfirstlane_b32 s6, v4
693 ; W64-O0-NEXT: s_mov_b32 s10, s7
694 ; W64-O0-NEXT: s_mov_b32 s11, s6
695 ; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[3:4]
696 ; W64-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[10:11]
697 ; W64-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
698 ; W64-O0-NEXT: s_mov_b32 s9, s12
699 ; W64-O0-NEXT: s_mov_b32 s10, s7
700 ; W64-O0-NEXT: s_mov_b32 s11, s6
701 ; W64-O0-NEXT: v_writelane_b32 v0, s8, 11
702 ; W64-O0-NEXT: v_writelane_b32 v0, s9, 12
703 ; W64-O0-NEXT: v_writelane_b32 v0, s10, 13
704 ; W64-O0-NEXT: v_writelane_b32 v0, s11, 14
705 ; W64-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
706 ; W64-O0-NEXT: v_writelane_b32 v0, s4, 15
707 ; W64-O0-NEXT: v_writelane_b32 v0, s5, 16
708 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1
709 ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
710 ; W64-O0-NEXT: s_mov_b64 exec, s[16:17]
711 ; W64-O0-NEXT: ; %bb.5: ; in Loop: Header=BB1_4 Depth=1
712 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1
713 ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
714 ; W64-O0-NEXT: s_mov_b64 exec, s[16:17]
715 ; W64-O0-NEXT: s_waitcnt vmcnt(0)
716 ; W64-O0-NEXT: v_readlane_b32 s4, v1, 15
717 ; W64-O0-NEXT: v_readlane_b32 s5, v1, 16
718 ; W64-O0-NEXT: v_readlane_b32 s8, v1, 11
719 ; W64-O0-NEXT: v_readlane_b32 s9, v1, 12
720 ; W64-O0-NEXT: v_readlane_b32 s10, v1, 13
721 ; W64-O0-NEXT: v_readlane_b32 s11, v1, 14
722 ; W64-O0-NEXT: v_readlane_b32 s6, v1, 0
723 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
724 ; W64-O0-NEXT: s_waitcnt vmcnt(0)
725 ; W64-O0-NEXT: s_nop 2
726 ; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen
727 ; W64-O0-NEXT: s_waitcnt vmcnt(0)
728 ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
729 ; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5]
730 ; W64-O0-NEXT: s_cbranch_execnz .LBB1_4
731 ; W64-O0-NEXT: ; %bb.6:
732 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1
733 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
734 ; W64-O0-NEXT: s_mov_b64 exec, s[16:17]
735 ; W64-O0-NEXT: s_waitcnt vmcnt(0)
736 ; W64-O0-NEXT: v_readlane_b32 s4, v0, 9
737 ; W64-O0-NEXT: v_readlane_b32 s5, v0, 10
738 ; W64-O0-NEXT: s_mov_b64 exec, s[4:5]
739 ; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
740 ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
741 ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
742 ; W64-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
743 ; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
744 ; W64-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
745 ; W64-O0-NEXT: s_waitcnt vmcnt(0)
746 ; W64-O0-NEXT: global_store_dword v[4:5], v6, off
747 ; W64-O0-NEXT: s_waitcnt vmcnt(0)
748 ; W64-O0-NEXT: global_store_dword v[1:2], v3, off
749 ; W64-O0-NEXT: s_waitcnt vmcnt(0)
750 ; W64-O0-NEXT: ; kill: killed $vgpr0
751 ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
752 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
753 ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
754 ; W64-O0-NEXT: s_mov_b64 exec, s[4:5]
755 ; W64-O0-NEXT: s_waitcnt vmcnt(0)
756 ; W64-O0-NEXT: s_setpc_b64 s[30:31]
758 %val0 = call float @llvm.amdgcn.struct.ptr.buffer.load.format.f32(ptr addrspace(8) %i, i32 %c, i32 0, i32 0, i32 0) #1
759 %val1 = call float @llvm.amdgcn.struct.ptr.buffer.load.format.f32(ptr addrspace(8) %j, i32 %c, i32 0, i32 0, i32 0) #1
760 store volatile float %val0, ptr addrspace(1) %out0
761 store volatile float %val1, ptr addrspace(1) %out1
765 ; Confirm spills do not occur between the XOR and branch that terminate the
766 ; waterfall loop BBs.
768 define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, i32 %c, ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
769 ; GFX9_W64-LABEL: mubuf_vgpr_outside_entry:
770 ; GFX9_W64: ; %bb.0: ; %entry
771 ; GFX9_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
772 ; GFX9_W64-NEXT: ;;#ASMSTART
773 ; GFX9_W64-NEXT: s_mov_b32 s4, 17
774 ; GFX9_W64-NEXT: ;;#ASMEND
775 ; GFX9_W64-NEXT: v_mov_b32_e32 v8, s4
776 ; GFX9_W64-NEXT: s_mov_b64 s[12:13], exec
777 ; GFX9_W64-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
778 ; GFX9_W64-NEXT: v_readfirstlane_b32 s8, v0
779 ; GFX9_W64-NEXT: v_readfirstlane_b32 s9, v1
780 ; GFX9_W64-NEXT: v_readfirstlane_b32 s10, v2
781 ; GFX9_W64-NEXT: v_readfirstlane_b32 s11, v3
782 ; GFX9_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
783 ; GFX9_W64-NEXT: v_cmp_eq_u64_e64 s[6:7], s[10:11], v[2:3]
784 ; GFX9_W64-NEXT: s_and_b64 s[6:7], vcc, s[6:7]
785 ; GFX9_W64-NEXT: s_and_saveexec_b64 s[6:7], s[6:7]
786 ; GFX9_W64-NEXT: s_nop 0
787 ; GFX9_W64-NEXT: buffer_load_format_x v9, v8, s[8:11], 0 idxen
788 ; GFX9_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
789 ; GFX9_W64-NEXT: ; implicit-def: $vgpr8
790 ; GFX9_W64-NEXT: s_xor_b64 exec, exec, s[6:7]
791 ; GFX9_W64-NEXT: s_cbranch_execnz .LBB2_1
792 ; GFX9_W64-NEXT: ; %bb.2:
793 ; GFX9_W64-NEXT: s_mov_b64 exec, s[12:13]
794 ; GFX9_W64-NEXT: v_and_b32_e32 v0, 0x3ff, v31
795 ; GFX9_W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
796 ; GFX9_W64-NEXT: s_and_saveexec_b64 s[6:7], vcc
797 ; GFX9_W64-NEXT: s_cbranch_execz .LBB2_6
798 ; GFX9_W64-NEXT: ; %bb.3: ; %bb1
799 ; GFX9_W64-NEXT: v_mov_b32_e32 v0, s4
800 ; GFX9_W64-NEXT: s_mov_b64 s[12:13], exec
801 ; GFX9_W64-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1
802 ; GFX9_W64-NEXT: v_readfirstlane_b32 s8, v4
803 ; GFX9_W64-NEXT: v_readfirstlane_b32 s9, v5
804 ; GFX9_W64-NEXT: v_readfirstlane_b32 s10, v6
805 ; GFX9_W64-NEXT: v_readfirstlane_b32 s11, v7
806 ; GFX9_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[4:5]
807 ; GFX9_W64-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[6:7]
808 ; GFX9_W64-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
809 ; GFX9_W64-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
810 ; GFX9_W64-NEXT: s_nop 0
811 ; GFX9_W64-NEXT: buffer_load_format_x v9, v0, s[8:11], 0 idxen
812 ; GFX9_W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
813 ; GFX9_W64-NEXT: ; implicit-def: $vgpr0
814 ; GFX9_W64-NEXT: s_xor_b64 exec, exec, s[4:5]
815 ; GFX9_W64-NEXT: s_cbranch_execnz .LBB2_4
816 ; GFX9_W64-NEXT: ; %bb.5:
817 ; GFX9_W64-NEXT: s_mov_b64 exec, s[12:13]
818 ; GFX9_W64-NEXT: .LBB2_6: ; %bb2
819 ; GFX9_W64-NEXT: s_or_b64 exec, exec, s[6:7]
820 ; GFX9_W64-NEXT: s_waitcnt vmcnt(0)
821 ; GFX9_W64-NEXT: global_store_dword v[11:12], v9, off
822 ; GFX9_W64-NEXT: s_waitcnt vmcnt(0)
823 ; GFX9_W64-NEXT: s_setpc_b64 s[30:31]
825 ; GFX1010_W32-LABEL: mubuf_vgpr_outside_entry:
826 ; GFX1010_W32: ; %bb.0: ; %entry
827 ; GFX1010_W32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
828 ; GFX1010_W32-NEXT: ;;#ASMSTART
829 ; GFX1010_W32-NEXT: s_mov_b32 s4, 17
830 ; GFX1010_W32-NEXT: ;;#ASMEND
831 ; GFX1010_W32-NEXT: v_mov_b32_e32 v8, s4
832 ; GFX1010_W32-NEXT: s_mov_b32 s6, exec_lo
833 ; GFX1010_W32-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
834 ; GFX1010_W32-NEXT: v_readfirstlane_b32 s8, v0
835 ; GFX1010_W32-NEXT: v_readfirstlane_b32 s9, v1
836 ; GFX1010_W32-NEXT: v_readfirstlane_b32 s10, v2
837 ; GFX1010_W32-NEXT: v_readfirstlane_b32 s11, v3
838 ; GFX1010_W32-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
839 ; GFX1010_W32-NEXT: v_cmp_eq_u64_e64 s5, s[10:11], v[2:3]
840 ; GFX1010_W32-NEXT: s_and_b32 s5, vcc_lo, s5
841 ; GFX1010_W32-NEXT: s_and_saveexec_b32 s5, s5
842 ; GFX1010_W32-NEXT: buffer_load_format_x v9, v8, s[8:11], 0 idxen
843 ; GFX1010_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
844 ; GFX1010_W32-NEXT: ; implicit-def: $vgpr8
845 ; GFX1010_W32-NEXT: s_waitcnt_depctr 0xffe3
846 ; GFX1010_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s5
847 ; GFX1010_W32-NEXT: s_cbranch_execnz .LBB2_1
848 ; GFX1010_W32-NEXT: ; %bb.2:
849 ; GFX1010_W32-NEXT: s_mov_b32 exec_lo, s6
850 ; GFX1010_W32-NEXT: v_and_b32_e32 v0, 0x3ff, v31
851 ; GFX1010_W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
852 ; GFX1010_W32-NEXT: s_and_saveexec_b32 s5, vcc_lo
853 ; GFX1010_W32-NEXT: s_cbranch_execz .LBB2_6
854 ; GFX1010_W32-NEXT: ; %bb.3: ; %bb1
855 ; GFX1010_W32-NEXT: v_mov_b32_e32 v0, s4
856 ; GFX1010_W32-NEXT: s_mov_b32 s6, exec_lo
857 ; GFX1010_W32-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1
858 ; GFX1010_W32-NEXT: v_readfirstlane_b32 s8, v4
859 ; GFX1010_W32-NEXT: v_readfirstlane_b32 s9, v5
860 ; GFX1010_W32-NEXT: v_readfirstlane_b32 s10, v6
861 ; GFX1010_W32-NEXT: v_readfirstlane_b32 s11, v7
862 ; GFX1010_W32-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[4:5]
863 ; GFX1010_W32-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[6:7]
864 ; GFX1010_W32-NEXT: s_and_b32 s4, vcc_lo, s4
865 ; GFX1010_W32-NEXT: s_and_saveexec_b32 s4, s4
866 ; GFX1010_W32-NEXT: buffer_load_format_x v9, v0, s[8:11], 0 idxen
867 ; GFX1010_W32-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
868 ; GFX1010_W32-NEXT: ; implicit-def: $vgpr0
869 ; GFX1010_W32-NEXT: s_waitcnt_depctr 0xffe3
870 ; GFX1010_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s4
871 ; GFX1010_W32-NEXT: s_cbranch_execnz .LBB2_4
872 ; GFX1010_W32-NEXT: ; %bb.5:
873 ; GFX1010_W32-NEXT: s_mov_b32 exec_lo, s6
874 ; GFX1010_W32-NEXT: .LBB2_6: ; %bb2
875 ; GFX1010_W32-NEXT: s_or_b32 exec_lo, exec_lo, s5
876 ; GFX1010_W32-NEXT: s_waitcnt vmcnt(0)
877 ; GFX1010_W32-NEXT: global_store_dword v[11:12], v9, off
878 ; GFX1010_W32-NEXT: s_waitcnt_vscnt null, 0x0
879 ; GFX1010_W32-NEXT: s_setpc_b64 s[30:31]
881 ; GFX1010_W64-LABEL: mubuf_vgpr_outside_entry:
882 ; GFX1010_W64: ; %bb.0: ; %entry
883 ; GFX1010_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
884 ; GFX1010_W64-NEXT: ;;#ASMSTART
885 ; GFX1010_W64-NEXT: s_mov_b32 s4, 17
886 ; GFX1010_W64-NEXT: ;;#ASMEND
887 ; GFX1010_W64-NEXT: v_mov_b32_e32 v8, s4
888 ; GFX1010_W64-NEXT: s_mov_b64 s[12:13], exec
889 ; GFX1010_W64-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
890 ; GFX1010_W64-NEXT: v_readfirstlane_b32 s8, v0
891 ; GFX1010_W64-NEXT: v_readfirstlane_b32 s9, v1
892 ; GFX1010_W64-NEXT: v_readfirstlane_b32 s10, v2
893 ; GFX1010_W64-NEXT: v_readfirstlane_b32 s11, v3
894 ; GFX1010_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
895 ; GFX1010_W64-NEXT: v_cmp_eq_u64_e64 s[6:7], s[10:11], v[2:3]
896 ; GFX1010_W64-NEXT: s_and_b64 s[6:7], vcc, s[6:7]
897 ; GFX1010_W64-NEXT: s_and_saveexec_b64 s[6:7], s[6:7]
898 ; GFX1010_W64-NEXT: buffer_load_format_x v9, v8, s[8:11], 0 idxen
899 ; GFX1010_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
900 ; GFX1010_W64-NEXT: ; implicit-def: $vgpr8
901 ; GFX1010_W64-NEXT: s_waitcnt_depctr 0xffe3
902 ; GFX1010_W64-NEXT: s_xor_b64 exec, exec, s[6:7]
903 ; GFX1010_W64-NEXT: s_cbranch_execnz .LBB2_1
904 ; GFX1010_W64-NEXT: ; %bb.2:
905 ; GFX1010_W64-NEXT: s_mov_b64 exec, s[12:13]
906 ; GFX1010_W64-NEXT: v_and_b32_e32 v0, 0x3ff, v31
907 ; GFX1010_W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
908 ; GFX1010_W64-NEXT: s_and_saveexec_b64 s[6:7], vcc
909 ; GFX1010_W64-NEXT: s_cbranch_execz .LBB2_6
910 ; GFX1010_W64-NEXT: ; %bb.3: ; %bb1
911 ; GFX1010_W64-NEXT: v_mov_b32_e32 v0, s4
912 ; GFX1010_W64-NEXT: s_mov_b64 s[12:13], exec
913 ; GFX1010_W64-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1
914 ; GFX1010_W64-NEXT: v_readfirstlane_b32 s8, v4
915 ; GFX1010_W64-NEXT: v_readfirstlane_b32 s9, v5
916 ; GFX1010_W64-NEXT: v_readfirstlane_b32 s10, v6
917 ; GFX1010_W64-NEXT: v_readfirstlane_b32 s11, v7
918 ; GFX1010_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[4:5]
919 ; GFX1010_W64-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[6:7]
920 ; GFX1010_W64-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
921 ; GFX1010_W64-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
922 ; GFX1010_W64-NEXT: buffer_load_format_x v9, v0, s[8:11], 0 idxen
923 ; GFX1010_W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
924 ; GFX1010_W64-NEXT: ; implicit-def: $vgpr0
925 ; GFX1010_W64-NEXT: s_waitcnt_depctr 0xffe3
926 ; GFX1010_W64-NEXT: s_xor_b64 exec, exec, s[4:5]
927 ; GFX1010_W64-NEXT: s_cbranch_execnz .LBB2_4
928 ; GFX1010_W64-NEXT: ; %bb.5:
929 ; GFX1010_W64-NEXT: s_mov_b64 exec, s[12:13]
930 ; GFX1010_W64-NEXT: .LBB2_6: ; %bb2
931 ; GFX1010_W64-NEXT: s_or_b64 exec, exec, s[6:7]
932 ; GFX1010_W64-NEXT: s_waitcnt vmcnt(0)
933 ; GFX1010_W64-NEXT: global_store_dword v[11:12], v9, off
934 ; GFX1010_W64-NEXT: s_waitcnt_vscnt null, 0x0
935 ; GFX1010_W64-NEXT: s_setpc_b64 s[30:31]
937 ; GFX1100_W32-LABEL: mubuf_vgpr_outside_entry:
938 ; GFX1100_W32: ; %bb.0: ; %entry
939 ; GFX1100_W32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
940 ; GFX1100_W32-NEXT: ;;#ASMSTART
941 ; GFX1100_W32-NEXT: s_mov_b32 s4, 17
942 ; GFX1100_W32-NEXT: ;;#ASMEND
943 ; GFX1100_W32-NEXT: v_mov_b32_e32 v8, s4
944 ; GFX1100_W32-NEXT: s_mov_b32 s1, exec_lo
945 ; GFX1100_W32-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
946 ; GFX1100_W32-NEXT: v_readfirstlane_b32 s8, v0
947 ; GFX1100_W32-NEXT: v_readfirstlane_b32 s9, v1
948 ; GFX1100_W32-NEXT: v_readfirstlane_b32 s10, v2
949 ; GFX1100_W32-NEXT: v_readfirstlane_b32 s11, v3
950 ; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
951 ; GFX1100_W32-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
952 ; GFX1100_W32-NEXT: v_cmp_eq_u64_e64 s0, s[10:11], v[2:3]
953 ; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
954 ; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0
955 ; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0
956 ; GFX1100_W32-NEXT: buffer_load_format_x v9, v8, s[8:11], 0 idxen
957 ; GFX1100_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
958 ; GFX1100_W32-NEXT: ; implicit-def: $vgpr8
959 ; GFX1100_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s0
960 ; GFX1100_W32-NEXT: s_cbranch_execnz .LBB2_1
961 ; GFX1100_W32-NEXT: ; %bb.2:
962 ; GFX1100_W32-NEXT: s_mov_b32 exec_lo, s1
963 ; GFX1100_W32-NEXT: v_and_b32_e32 v0, 0x3ff, v31
964 ; GFX1100_W32-NEXT: s_mov_b32 s1, exec_lo
965 ; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
966 ; GFX1100_W32-NEXT: v_cmpx_eq_u32_e32 0, v0
967 ; GFX1100_W32-NEXT: s_cbranch_execz .LBB2_6
968 ; GFX1100_W32-NEXT: ; %bb.3: ; %bb1
969 ; GFX1100_W32-NEXT: v_mov_b32_e32 v0, s4
970 ; GFX1100_W32-NEXT: s_mov_b32 s2, exec_lo
971 ; GFX1100_W32-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1
972 ; GFX1100_W32-NEXT: v_readfirstlane_b32 s4, v4
973 ; GFX1100_W32-NEXT: v_readfirstlane_b32 s5, v5
974 ; GFX1100_W32-NEXT: v_readfirstlane_b32 s6, v6
975 ; GFX1100_W32-NEXT: v_readfirstlane_b32 s7, v7
976 ; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
977 ; GFX1100_W32-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5]
978 ; GFX1100_W32-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7]
979 ; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
980 ; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0
981 ; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0
982 ; GFX1100_W32-NEXT: buffer_load_format_x v9, v0, s[4:7], 0 idxen
983 ; GFX1100_W32-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
984 ; GFX1100_W32-NEXT: ; implicit-def: $vgpr0
985 ; GFX1100_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s0
986 ; GFX1100_W32-NEXT: s_cbranch_execnz .LBB2_4
987 ; GFX1100_W32-NEXT: ; %bb.5:
988 ; GFX1100_W32-NEXT: s_mov_b32 exec_lo, s2
989 ; GFX1100_W32-NEXT: .LBB2_6: ; %bb2
990 ; GFX1100_W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
991 ; GFX1100_W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
992 ; GFX1100_W32-NEXT: s_waitcnt vmcnt(0)
993 ; GFX1100_W32-NEXT: global_store_b32 v[11:12], v9, off dlc
994 ; GFX1100_W32-NEXT: s_waitcnt_vscnt null, 0x0
995 ; GFX1100_W32-NEXT: s_setpc_b64 s[30:31]
997 ; GFX1100_W64-LABEL: mubuf_vgpr_outside_entry:
998 ; GFX1100_W64: ; %bb.0: ; %entry
999 ; GFX1100_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1000 ; GFX1100_W64-NEXT: ;;#ASMSTART
1001 ; GFX1100_W64-NEXT: s_mov_b32 s4, 17
1002 ; GFX1100_W64-NEXT: ;;#ASMEND
1003 ; GFX1100_W64-NEXT: v_mov_b32_e32 v8, s4
1004 ; GFX1100_W64-NEXT: s_mov_b64 s[2:3], exec
1005 ; GFX1100_W64-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
1006 ; GFX1100_W64-NEXT: v_readfirstlane_b32 s8, v0
1007 ; GFX1100_W64-NEXT: v_readfirstlane_b32 s9, v1
1008 ; GFX1100_W64-NEXT: v_readfirstlane_b32 s10, v2
1009 ; GFX1100_W64-NEXT: v_readfirstlane_b32 s11, v3
1010 ; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
1011 ; GFX1100_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
1012 ; GFX1100_W64-NEXT: v_cmp_eq_u64_e64 s[0:1], s[10:11], v[2:3]
1013 ; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
1014 ; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
1015 ; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
1016 ; GFX1100_W64-NEXT: buffer_load_format_x v9, v8, s[8:11], 0 idxen
1017 ; GFX1100_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
1018 ; GFX1100_W64-NEXT: ; implicit-def: $vgpr8
1019 ; GFX1100_W64-NEXT: s_xor_b64 exec, exec, s[0:1]
1020 ; GFX1100_W64-NEXT: s_cbranch_execnz .LBB2_1
1021 ; GFX1100_W64-NEXT: ; %bb.2:
1022 ; GFX1100_W64-NEXT: s_mov_b64 exec, s[2:3]
1023 ; GFX1100_W64-NEXT: v_and_b32_e32 v0, 0x3ff, v31
1024 ; GFX1100_W64-NEXT: s_mov_b64 s[2:3], exec
1025 ; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
1026 ; GFX1100_W64-NEXT: v_cmpx_eq_u32_e32 0, v0
1027 ; GFX1100_W64-NEXT: s_cbranch_execz .LBB2_6
1028 ; GFX1100_W64-NEXT: ; %bb.3: ; %bb1
1029 ; GFX1100_W64-NEXT: v_mov_b32_e32 v0, s4
1030 ; GFX1100_W64-NEXT: s_mov_b64 s[8:9], exec
1031 ; GFX1100_W64-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1
1032 ; GFX1100_W64-NEXT: v_readfirstlane_b32 s4, v4
1033 ; GFX1100_W64-NEXT: v_readfirstlane_b32 s5, v5
1034 ; GFX1100_W64-NEXT: v_readfirstlane_b32 s6, v6
1035 ; GFX1100_W64-NEXT: v_readfirstlane_b32 s7, v7
1036 ; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
1037 ; GFX1100_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[4:5]
1038 ; GFX1100_W64-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[6:7]
1039 ; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
1040 ; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
1041 ; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
1042 ; GFX1100_W64-NEXT: buffer_load_format_x v9, v0, s[4:7], 0 idxen
1043 ; GFX1100_W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
1044 ; GFX1100_W64-NEXT: ; implicit-def: $vgpr0
1045 ; GFX1100_W64-NEXT: s_xor_b64 exec, exec, s[0:1]
1046 ; GFX1100_W64-NEXT: s_cbranch_execnz .LBB2_4
1047 ; GFX1100_W64-NEXT: ; %bb.5:
1048 ; GFX1100_W64-NEXT: s_mov_b64 exec, s[8:9]
1049 ; GFX1100_W64-NEXT: .LBB2_6: ; %bb2
1050 ; GFX1100_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1051 ; GFX1100_W64-NEXT: s_or_b64 exec, exec, s[2:3]
1052 ; GFX1100_W64-NEXT: s_waitcnt vmcnt(0)
1053 ; GFX1100_W64-NEXT: global_store_b32 v[11:12], v9, off dlc
1054 ; GFX1100_W64-NEXT: s_waitcnt_vscnt null, 0x0
1055 ; GFX1100_W64-NEXT: s_setpc_b64 s[30:31]
1057 ; W64-O0-LABEL: mubuf_vgpr_outside_entry:
1058 ; W64-O0: ; %bb.0: ; %entry
1059 ; W64-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1060 ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
1061 ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
1062 ; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
1063 ; W64-O0-NEXT: s_mov_b64 exec, s[4:5]
1064 ; W64-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane
1065 ; W64-O0-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
1066 ; W64-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
1067 ; W64-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
1068 ; W64-O0-NEXT: v_mov_b32_e32 v6, v5
1069 ; W64-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
1070 ; W64-O0-NEXT: s_nop 0
1071 ; W64-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
1072 ; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
1073 ; W64-O0-NEXT: v_mov_b32_e32 v4, v3
1074 ; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
1075 ; W64-O0-NEXT: v_mov_b32_e32 v13, v2
1076 ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
1077 ; W64-O0-NEXT: v_mov_b32_e32 v10, v1
1078 ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
1079 ; W64-O0-NEXT: v_mov_b32_e32 v8, v0
1080 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1
1081 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
1082 ; W64-O0-NEXT: s_mov_b64 exec, s[16:17]
1083 ; W64-O0-NEXT: ; implicit-def: $sgpr4
1084 ; W64-O0-NEXT: ; implicit-def: $sgpr4
1085 ; W64-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec
1086 ; W64-O0-NEXT: v_mov_b32_e32 v14, v4
1087 ; W64-O0-NEXT: v_mov_b32_e32 v4, v14
1088 ; W64-O0-NEXT: v_mov_b32_e32 v6, v13
1089 ; W64-O0-NEXT: ; implicit-def: $sgpr4
1090 ; W64-O0-NEXT: ; implicit-def: $sgpr4
1091 ; W64-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
1092 ; W64-O0-NEXT: v_mov_b32_e32 v9, v10
1093 ; W64-O0-NEXT: v_mov_b32_e32 v13, v9
1094 ; W64-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $vgpr8_vgpr9 killed $exec
1095 ; W64-O0-NEXT: ; implicit-def: $sgpr4
1096 ; W64-O0-NEXT: ; implicit-def: $sgpr4
1097 ; W64-O0-NEXT: ; implicit-def: $sgpr4
1098 ; W64-O0-NEXT: ; implicit-def: $sgpr4
1099 ; W64-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9_vgpr10_vgpr11 killed $exec
1100 ; W64-O0-NEXT: v_mov_b32_e32 v9, v13
1101 ; W64-O0-NEXT: v_mov_b32_e32 v10, v6
1102 ; W64-O0-NEXT: v_mov_b32_e32 v11, v4
1103 ; W64-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
1104 ; W64-O0-NEXT: s_waitcnt vmcnt(0)
1105 ; W64-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
1106 ; W64-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
1107 ; W64-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
1108 ; W64-O0-NEXT: ; implicit-def: $sgpr4
1109 ; W64-O0-NEXT: ; implicit-def: $sgpr4
1110 ; W64-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
1111 ; W64-O0-NEXT: v_mov_b32_e32 v6, v7
1112 ; W64-O0-NEXT: ; implicit-def: $sgpr4
1113 ; W64-O0-NEXT: ; implicit-def: $sgpr4
1114 ; W64-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
1115 ; W64-O0-NEXT: v_mov_b32_e32 v4, v2
1116 ; W64-O0-NEXT: ; implicit-def: $sgpr4
1117 ; W64-O0-NEXT: ; implicit-def: $sgpr4
1118 ; W64-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
1119 ; W64-O0-NEXT: v_mov_b32_e32 v2, v12
1120 ; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
1121 ; W64-O0-NEXT: s_waitcnt vmcnt(0)
1122 ; W64-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
1123 ; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
1124 ; W64-O0-NEXT: s_waitcnt vmcnt(0)
1125 ; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
1126 ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
1127 ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
1128 ; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
1129 ; W64-O0-NEXT: s_waitcnt vmcnt(0)
1130 ; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
1131 ; W64-O0-NEXT: ;;#ASMSTART
1132 ; W64-O0-NEXT: s_mov_b32 s4, 17
1133 ; W64-O0-NEXT: ;;#ASMEND
1134 ; W64-O0-NEXT: s_mov_b32 s5, s4
1135 ; W64-O0-NEXT: v_writelane_b32 v0, s5, 0
1136 ; W64-O0-NEXT: s_mov_b32 s5, 0
1137 ; W64-O0-NEXT: v_writelane_b32 v0, s5, 1
1138 ; W64-O0-NEXT: v_mov_b32_e32 v1, s4
1139 ; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
1140 ; W64-O0-NEXT: s_mov_b64 s[4:5], exec
1141 ; W64-O0-NEXT: v_writelane_b32 v0, s4, 2
1142 ; W64-O0-NEXT: v_writelane_b32 v0, s5, 3
1143 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1
1144 ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
1145 ; W64-O0-NEXT: s_mov_b64 exec, s[16:17]
1146 ; W64-O0-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
1147 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1
1148 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
1149 ; W64-O0-NEXT: s_mov_b64 exec, s[16:17]
1150 ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
1151 ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
1152 ; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
1153 ; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
1154 ; W64-O0-NEXT: s_waitcnt vmcnt(0)
1155 ; W64-O0-NEXT: v_readfirstlane_b32 s8, v1
1156 ; W64-O0-NEXT: v_readfirstlane_b32 s12, v2
1157 ; W64-O0-NEXT: s_mov_b32 s4, s8
1158 ; W64-O0-NEXT: s_mov_b32 s5, s12
1159 ; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[1:2]
1160 ; W64-O0-NEXT: v_readfirstlane_b32 s7, v3
1161 ; W64-O0-NEXT: v_readfirstlane_b32 s6, v4
1162 ; W64-O0-NEXT: s_mov_b32 s10, s7
1163 ; W64-O0-NEXT: s_mov_b32 s11, s6
1164 ; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[3:4]
1165 ; W64-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[10:11]
1166 ; W64-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
1167 ; W64-O0-NEXT: s_mov_b32 s9, s12
1168 ; W64-O0-NEXT: s_mov_b32 s10, s7
1169 ; W64-O0-NEXT: s_mov_b32 s11, s6
1170 ; W64-O0-NEXT: v_writelane_b32 v0, s8, 4
1171 ; W64-O0-NEXT: v_writelane_b32 v0, s9, 5
1172 ; W64-O0-NEXT: v_writelane_b32 v0, s10, 6
1173 ; W64-O0-NEXT: v_writelane_b32 v0, s11, 7
1174 ; W64-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
1175 ; W64-O0-NEXT: v_writelane_b32 v0, s4, 8
1176 ; W64-O0-NEXT: v_writelane_b32 v0, s5, 9
1177 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1
1178 ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
1179 ; W64-O0-NEXT: s_mov_b64 exec, s[16:17]
1180 ; W64-O0-NEXT: ; %bb.2: ; in Loop: Header=BB2_1 Depth=1
1181 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1
1182 ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
1183 ; W64-O0-NEXT: s_mov_b64 exec, s[16:17]
1184 ; W64-O0-NEXT: s_waitcnt vmcnt(0)
1185 ; W64-O0-NEXT: v_readlane_b32 s4, v1, 8
1186 ; W64-O0-NEXT: v_readlane_b32 s5, v1, 9
1187 ; W64-O0-NEXT: v_readlane_b32 s8, v1, 4
1188 ; W64-O0-NEXT: v_readlane_b32 s9, v1, 5
1189 ; W64-O0-NEXT: v_readlane_b32 s10, v1, 6
1190 ; W64-O0-NEXT: v_readlane_b32 s11, v1, 7
1191 ; W64-O0-NEXT: v_readlane_b32 s6, v1, 1
1192 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
1193 ; W64-O0-NEXT: s_waitcnt vmcnt(0)
1194 ; W64-O0-NEXT: s_nop 2
1195 ; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen
1196 ; W64-O0-NEXT: s_waitcnt vmcnt(0)
1197 ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
1198 ; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5]
1199 ; W64-O0-NEXT: s_cbranch_execnz .LBB2_1
1200 ; W64-O0-NEXT: ; %bb.3:
1201 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1
1202 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
1203 ; W64-O0-NEXT: s_mov_b64 exec, s[16:17]
1204 ; W64-O0-NEXT: s_waitcnt vmcnt(0)
1205 ; W64-O0-NEXT: v_readlane_b32 s6, v0, 2
1206 ; W64-O0-NEXT: v_readlane_b32 s7, v0, 3
1207 ; W64-O0-NEXT: s_mov_b64 exec, s[6:7]
1208 ; W64-O0-NEXT: v_readlane_b32 s4, v0, 1
1209 ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
1210 ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
1211 ; W64-O0-NEXT: s_mov_b32 s5, 0x3ff
1212 ; W64-O0-NEXT: s_waitcnt vmcnt(0)
1213 ; W64-O0-NEXT: v_and_b32_e64 v2, v2, s5
1214 ; W64-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v2, s4
1215 ; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
1216 ; W64-O0-NEXT: s_mov_b64 s[4:5], exec
1217 ; W64-O0-NEXT: v_writelane_b32 v0, s4, 10
1218 ; W64-O0-NEXT: v_writelane_b32 v0, s5, 11
1219 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1
1220 ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
1221 ; W64-O0-NEXT: s_mov_b64 exec, s[16:17]
1222 ; W64-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
1223 ; W64-O0-NEXT: s_mov_b64 exec, s[4:5]
1224 ; W64-O0-NEXT: s_cbranch_execz .LBB2_8
1225 ; W64-O0-NEXT: ; %bb.4: ; %bb1
1226 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1
1227 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
1228 ; W64-O0-NEXT: s_mov_b64 exec, s[16:17]
1229 ; W64-O0-NEXT: s_waitcnt vmcnt(0)
1230 ; W64-O0-NEXT: v_readlane_b32 s4, v0, 0
1231 ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
1232 ; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
1233 ; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
1234 ; W64-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
1235 ; W64-O0-NEXT: s_waitcnt vmcnt(0)
1236 ; W64-O0-NEXT: v_mov_b32_e32 v7, v5
1237 ; W64-O0-NEXT: v_mov_b32_e32 v1, v4
1238 ; W64-O0-NEXT: v_mov_b32_e32 v5, v3
1239 ; W64-O0-NEXT: v_mov_b32_e32 v6, v2
1240 ; W64-O0-NEXT: ; implicit-def: $sgpr5
1241 ; W64-O0-NEXT: ; implicit-def: $sgpr5
1242 ; W64-O0-NEXT: ; implicit-def: $sgpr5
1243 ; W64-O0-NEXT: ; implicit-def: $sgpr5
1244 ; W64-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec
1245 ; W64-O0-NEXT: v_mov_b32_e32 v2, v7
1246 ; W64-O0-NEXT: v_mov_b32_e32 v3, v6
1247 ; W64-O0-NEXT: v_mov_b32_e32 v4, v5
1248 ; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
1249 ; W64-O0-NEXT: s_waitcnt vmcnt(0)
1250 ; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
1251 ; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
1252 ; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
1253 ; W64-O0-NEXT: s_mov_b32 s5, 0
1254 ; W64-O0-NEXT: v_writelane_b32 v0, s5, 12
1255 ; W64-O0-NEXT: v_mov_b32_e32 v1, s4
1256 ; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
1257 ; W64-O0-NEXT: s_mov_b64 s[4:5], exec
1258 ; W64-O0-NEXT: v_writelane_b32 v0, s4, 13
1259 ; W64-O0-NEXT: v_writelane_b32 v0, s5, 14
1260 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1
1261 ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
1262 ; W64-O0-NEXT: s_mov_b64 exec, s[16:17]
1263 ; W64-O0-NEXT: .LBB2_5: ; =>This Inner Loop Header: Depth=1
1264 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1
1265 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
1266 ; W64-O0-NEXT: s_mov_b64 exec, s[16:17]
1267 ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
1268 ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
1269 ; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
1270 ; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
1271 ; W64-O0-NEXT: s_waitcnt vmcnt(0)
1272 ; W64-O0-NEXT: v_readfirstlane_b32 s8, v1
1273 ; W64-O0-NEXT: v_readfirstlane_b32 s12, v2
1274 ; W64-O0-NEXT: s_mov_b32 s4, s8
1275 ; W64-O0-NEXT: s_mov_b32 s5, s12
1276 ; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[1:2]
1277 ; W64-O0-NEXT: v_readfirstlane_b32 s7, v3
1278 ; W64-O0-NEXT: v_readfirstlane_b32 s6, v4
1279 ; W64-O0-NEXT: s_mov_b32 s10, s7
1280 ; W64-O0-NEXT: s_mov_b32 s11, s6
1281 ; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[3:4]
1282 ; W64-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[10:11]
1283 ; W64-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
1284 ; W64-O0-NEXT: s_mov_b32 s9, s12
1285 ; W64-O0-NEXT: s_mov_b32 s10, s7
1286 ; W64-O0-NEXT: s_mov_b32 s11, s6
1287 ; W64-O0-NEXT: v_writelane_b32 v0, s8, 15
1288 ; W64-O0-NEXT: v_writelane_b32 v0, s9, 16
1289 ; W64-O0-NEXT: v_writelane_b32 v0, s10, 17
1290 ; W64-O0-NEXT: v_writelane_b32 v0, s11, 18
1291 ; W64-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
1292 ; W64-O0-NEXT: v_writelane_b32 v0, s4, 19
1293 ; W64-O0-NEXT: v_writelane_b32 v0, s5, 20
1294 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1
1295 ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
1296 ; W64-O0-NEXT: s_mov_b64 exec, s[16:17]
1297 ; W64-O0-NEXT: ; %bb.6: ; in Loop: Header=BB2_5 Depth=1
1298 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1
1299 ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
1300 ; W64-O0-NEXT: s_mov_b64 exec, s[16:17]
1301 ; W64-O0-NEXT: s_waitcnt vmcnt(0)
1302 ; W64-O0-NEXT: v_readlane_b32 s4, v1, 19
1303 ; W64-O0-NEXT: v_readlane_b32 s5, v1, 20
1304 ; W64-O0-NEXT: v_readlane_b32 s8, v1, 15
1305 ; W64-O0-NEXT: v_readlane_b32 s9, v1, 16
1306 ; W64-O0-NEXT: v_readlane_b32 s10, v1, 17
1307 ; W64-O0-NEXT: v_readlane_b32 s11, v1, 18
1308 ; W64-O0-NEXT: v_readlane_b32 s6, v1, 12
1309 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
1310 ; W64-O0-NEXT: s_waitcnt vmcnt(0)
1311 ; W64-O0-NEXT: s_nop 2
1312 ; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen
1313 ; W64-O0-NEXT: s_waitcnt vmcnt(0)
1314 ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
1315 ; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5]
1316 ; W64-O0-NEXT: s_cbranch_execnz .LBB2_5
1317 ; W64-O0-NEXT: ; %bb.7:
1318 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1
1319 ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
1320 ; W64-O0-NEXT: s_mov_b64 exec, s[16:17]
1321 ; W64-O0-NEXT: s_waitcnt vmcnt(0)
1322 ; W64-O0-NEXT: v_readlane_b32 s4, v1, 13
1323 ; W64-O0-NEXT: v_readlane_b32 s5, v1, 14
1324 ; W64-O0-NEXT: s_mov_b64 exec, s[4:5]
1325 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
1326 ; W64-O0-NEXT: s_waitcnt vmcnt(0)
1327 ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
1328 ; W64-O0-NEXT: .LBB2_8: ; %bb2
1329 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1
1330 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
1331 ; W64-O0-NEXT: s_mov_b64 exec, s[16:17]
1332 ; W64-O0-NEXT: s_waitcnt vmcnt(0)
1333 ; W64-O0-NEXT: v_readlane_b32 s4, v0, 10
1334 ; W64-O0-NEXT: v_readlane_b32 s5, v0, 11
1335 ; W64-O0-NEXT: s_or_b64 exec, exec, s[4:5]
1336 ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
1337 ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
1338 ; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
1339 ; W64-O0-NEXT: s_waitcnt vmcnt(0)
1340 ; W64-O0-NEXT: global_store_dword v[1:2], v3, off
1341 ; W64-O0-NEXT: s_waitcnt vmcnt(0)
1342 ; W64-O0-NEXT: ; kill: killed $vgpr0
1343 ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
1344 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
1345 ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
1346 ; W64-O0-NEXT: s_mov_b64 exec, s[4:5]
1347 ; W64-O0-NEXT: s_waitcnt vmcnt(0)
1348 ; W64-O0-NEXT: s_setpc_b64 s[30:31]
1350 %live.out.reg = call i32 asm sideeffect "s_mov_b32 $0, 17", "={s4}" ()
1351 %val0 = call float @llvm.amdgcn.struct.ptr.buffer.load.format.f32(ptr addrspace(8) %i, i32 %live.out.reg, i32 0, i32 0, i32 0) #1
1352 %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
1353 %cmp = icmp eq i32 %idx, 0
1354 br i1 %cmp, label %bb1, label %bb2
1357 %val1 = call float @llvm.amdgcn.struct.ptr.buffer.load.format.f32(ptr addrspace(8) %j, i32 %live.out.reg, i32 0, i32 0, i32 0) #1
1361 %val = phi float [ %val0, %entry ], [ %val1, %bb1 ]
1362 store volatile float %val, ptr addrspace(1) %out
; Intrinsic declarations shared by the test functions in this file.
;
; workitem.id.x takes no operands and yields an i32; the IR above compares that
; value against 0 to choose between the %bb1 and %bb2 paths.
1366 declare i32 @llvm.amdgcn.workitem.id.x() #1
; Buffer load returning a single float. The first operand is the
; ptr addrspace(8) buffer resource (the calls above pass %i and %j there),
; followed by four i32 operands; the last one is marked immarg, so every call
; site has to supply it as an immediate constant (the calls above pass 0).
1367 declare float @llvm.amdgcn.struct.ptr.buffer.load.format.f32(ptr addrspace(8), i32, i32, i32, i32 immarg) #1
; Attribute groups referenced by number throughout the file.
; #0 is attached to the test function definitions (e.g. @mubuf_vgpr).
1369 attributes #0 = { nounwind }
; #1 is attached to the intrinsic declarations and to their call sites.
1370 attributes #1 = { nounwind readonly }