1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -O0 -mtriple=amdgcn- -mcpu=gfx900 -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9-O0 %s
3 ; RUN: llc -mtriple=amdgcn- -mcpu=gfx900 -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9-O3 %s
; strict_wwm_no_cfg: straight-line strict WWM test with no control flow.
; Both elements of a buffer-loaded <2 x i32> go through
; set.inactive -> update.dpp -> add -> strict.wwm; the two results are then
; compared and a value derived from the comparison is stored back to the same
; buffer at offset 4.
; In the expected output for both runs, each inactive-lane write sits between a
; pair of "s_not_b64 exec, exec", and each DPP/add sequence sits between
; "s_or_saveexec_b64 ..., -1" and the matching "s_mov_b64 exec, ...".
5 define amdgpu_gfx void @strict_wwm_no_cfg(<4 x i32> inreg %tmp14) {
6 ; GFX9-O0-LABEL: strict_wwm_no_cfg:
8 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
10 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
11 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
12 ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
13 ; GFX9-O0-NEXT: s_mov_b64 exec, s[8:9]
14 ; GFX9-O0-NEXT: s_mov_b32 s8, s7
15 ; GFX9-O0-NEXT: s_mov_b32 s9, s6
16 ; GFX9-O0-NEXT: s_mov_b32 s10, s5
17 ; GFX9-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
18 ; GFX9-O0-NEXT: s_mov_b32 s5, s10
19 ; GFX9-O0-NEXT: s_mov_b32 s6, s9
20 ; GFX9-O0-NEXT: s_mov_b32 s7, s8
21 ; GFX9-O0-NEXT: ; kill: def $sgpr8_sgpr9_sgpr10_sgpr11 killed $sgpr4_sgpr5_sgpr6_sgpr7
22 ; GFX9-O0-NEXT: s_mov_b32 s8, 0
23 ; GFX9-O0-NEXT: buffer_load_dwordx2 v[5:6], off, s[4:7], s8
24 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
25 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5
26 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6
27 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4
28 ; GFX9-O0-NEXT: s_not_b64 exec, exec
29 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s8
30 ; GFX9-O0-NEXT: s_not_b64 exec, exec
31 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
32 ; GFX9-O0-NEXT: s_not_b64 exec, exec
33 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8
34 ; GFX9-O0-NEXT: s_not_b64 exec, exec
35 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[10:11], -1
36 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s8
37 ; GFX9-O0-NEXT: s_nop 1
38 ; GFX9-O0-NEXT: v_mov_b32_dpp v2, v0 row_bcast:31 row_mask:0xc bank_mask:0xf
39 ; GFX9-O0-NEXT: v_add_u32_e64 v0, v0, v2
40 ; GFX9-O0-NEXT: s_mov_b64 exec, s[10:11]
41 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0
42 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[10:11], -1
43 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s8
44 ; GFX9-O0-NEXT: s_nop 1
45 ; GFX9-O0-NEXT: v_mov_b32_dpp v0, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
46 ; GFX9-O0-NEXT: v_add_u32_e64 v0, v1, v0
47 ; GFX9-O0-NEXT: s_mov_b64 exec, s[10:11]
48 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0
49 ; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[10:11], v3, v4
50 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[10:11]
51 ; GFX9-O0-NEXT: s_mov_b32 s9, 1
52 ; GFX9-O0-NEXT: v_lshlrev_b32_e64 v3, s9, v3
53 ; GFX9-O0-NEXT: s_mov_b32 s9, 2
54 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s9
55 ; GFX9-O0-NEXT: v_and_b32_e32 v3, v3, v4
56 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[4:7], s8 offset:4
57 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[4:5], -1
58 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
59 ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
60 ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
61 ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5]
62 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
63 ; GFX9-O0-NEXT: s_setpc_b64 s[30:31]
65 ; GFX9-O3-LABEL: strict_wwm_no_cfg:
67 ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
68 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[8:9], -1
69 ; GFX9-O3-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
70 ; GFX9-O3-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
71 ; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
72 ; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
73 ; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
74 ; GFX9-O3-NEXT: s_mov_b64 exec, s[8:9]
75 ; GFX9-O3-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
76 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[8:9], -1
77 ; GFX9-O3-NEXT: v_mov_b32_e32 v0, 0
78 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0
79 ; GFX9-O3-NEXT: s_mov_b64 exec, s[8:9]
80 ; GFX9-O3-NEXT: s_not_b64 exec, exec
81 ; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
82 ; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0
83 ; GFX9-O3-NEXT: s_not_b64 exec, exec
84 ; GFX9-O3-NEXT: s_not_b64 exec, exec
85 ; GFX9-O3-NEXT: v_mov_b32_e32 v3, 0
86 ; GFX9-O3-NEXT: s_not_b64 exec, exec
87 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[8:9], -1
88 ; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
89 ; GFX9-O3-NEXT: v_mov_b32_dpp v0, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
90 ; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1
91 ; GFX9-O3-NEXT: v_add_u32_e32 v0, v3, v0
92 ; GFX9-O3-NEXT: s_mov_b64 exec, s[8:9]
93 ; GFX9-O3-NEXT: v_mov_b32_e32 v4, v1
94 ; GFX9-O3-NEXT: v_mov_b32_e32 v5, v0
95 ; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
96 ; GFX9-O3-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
97 ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v4, 1, v4
98 ; GFX9-O3-NEXT: v_and_b32_e32 v4, 2, v4
99 ; GFX9-O3-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:4
100 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1
101 ; GFX9-O3-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
102 ; GFX9-O3-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
103 ; GFX9-O3-NEXT: s_nop 0
104 ; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
105 ; GFX9-O3-NEXT: s_nop 0
106 ; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
107 ; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5]
108 ; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
109 ; GFX9-O3-NEXT: s_setpc_b64 s[30:31]
; Load two dwords from the buffer resource %tmp14 and reinterpret them as i32.
110 %tmp100 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %tmp14, i32 0, i32 0, i32 0)
111 %tmp101 = bitcast <2 x float> %tmp100 to <2 x i32>
112 %tmp102 = extractelement <2 x i32> %tmp101, i32 0
113 %tmp103 = extractelement <2 x i32> %tmp101, i32 1
; Give each element the value 0 in inactive lanes (the s_not_b64 pairs above).
114 %tmp105 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %tmp102, i32 0)
115 %tmp107 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %tmp103, i32 0)
; First element: DPP move (row_bcast:31 row_mask:0xc bank_mask:0xf in the
; expected output), add to the source, then mark the result with strict.wwm.
118 %tmp120 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp105, i32 323, i32 12, i32 15, i1 false)
119 %tmp121 = add i32 %tmp105, %tmp120
120 %tmp122 = tail call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp121)
; Second element: identical sequence.
122 %tmp135 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp107, i32 323, i32 12, i32 15, i1 false)
123 %tmp136 = add i32 %tmp107, %tmp135
124 %tmp137 = tail call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp136)
; Compare the two results, reduce the comparison to 0 or 2, and store it at
; byte offset 4 of the same buffer.
126 %tmp138 = icmp eq i32 %tmp122, %tmp137
127 %tmp139 = sext i1 %tmp138 to i32
128 %tmp140 = shl nsw i32 %tmp139, 1
129 %tmp141 = and i32 %tmp140, 2
130 %tmp145 = bitcast i32 %tmp141 to float
131 call void @llvm.amdgcn.raw.buffer.store.f32(float %tmp145, <4 x i32> %tmp14, i32 4, i32 0, i32 0)
; strict_wwm_cfg: strict WWM test with control flow.
; The first loaded element is processed with
; set.inactive -> update.dpp -> add -> strict.wwm before the branch. The second
; element goes through the same sequence only in the %if block, which is
; entered when %arg is 0. At %merge a phi selects either that result or the
; constant 0, and the comparison result is stored to the buffer at offset 4.
; The expected output shows a separate "s_or_saveexec_b64 ..., -1" region
; before the branch and another one inside the %if block.
135 define amdgpu_gfx void @strict_wwm_cfg(<4 x i32> inreg %tmp14, i32 %arg) {
136 ; GFX9-O0-LABEL: strict_wwm_cfg:
137 ; GFX9-O0: ; %bb.0: ; %entry
138 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
139 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
140 ; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
141 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
142 ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
143 ; GFX9-O0-NEXT: s_mov_b64 exec, s[8:9]
144 ; GFX9-O0-NEXT: v_writelane_b32 v5, s30, 0
145 ; GFX9-O0-NEXT: v_writelane_b32 v5, s31, 1
146 ; GFX9-O0-NEXT: s_mov_b32 s8, s4
147 ; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
148 ; GFX9-O0-NEXT: s_mov_b32 s9, s5
149 ; GFX9-O0-NEXT: s_mov_b32 s10, s6
150 ; GFX9-O0-NEXT: s_mov_b32 s11, s7
151 ; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[8:9]
152 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[10:11]
153 ; GFX9-O0-NEXT: v_writelane_b32 v5, s4, 2
154 ; GFX9-O0-NEXT: v_writelane_b32 v5, s5, 3
155 ; GFX9-O0-NEXT: v_writelane_b32 v5, s6, 4
156 ; GFX9-O0-NEXT: v_writelane_b32 v5, s7, 5
157 ; GFX9-O0-NEXT: s_mov_b32 s4, 0
158 ; GFX9-O0-NEXT: s_nop 0
159 ; GFX9-O0-NEXT: buffer_load_dwordx2 v[3:4], off, s[8:11], s4
160 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
161 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
162 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
163 ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
164 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6_sgpr7
165 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
166 ; GFX9-O0-NEXT: s_not_b64 exec, exec
167 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s4
168 ; GFX9-O0-NEXT: s_not_b64 exec, exec
169 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
170 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s4
171 ; GFX9-O0-NEXT: s_nop 1
172 ; GFX9-O0-NEXT: v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
173 ; GFX9-O0-NEXT: v_add_u32_e64 v1, v1, v2
174 ; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
175 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
176 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
177 ; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s4
178 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4
179 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
180 ; GFX9-O0-NEXT: s_mov_b64 s[4:5], exec
181 ; GFX9-O0-NEXT: v_writelane_b32 v5, s4, 6
182 ; GFX9-O0-NEXT: v_writelane_b32 v5, s5, 7
183 ; GFX9-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
184 ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5]
185 ; GFX9-O0-NEXT: s_cbranch_execz BB1_2
186 ; GFX9-O0-NEXT: ; %bb.1: ; %if
187 ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
188 ; GFX9-O0-NEXT: s_nop 0
189 ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
190 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
191 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4
192 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[4:5], -1
193 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, 0
194 ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5]
195 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0
196 ; GFX9-O0-NEXT: s_not_b64 exec, exec
197 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1
198 ; GFX9-O0-NEXT: s_not_b64 exec, exec
199 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[4:5], -1
200 ; GFX9-O0-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
201 ; GFX9-O0-NEXT: v_add_u32_e64 v1, v2, v1
202 ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5]
203 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1
204 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
205 ; GFX9-O0-NEXT: BB1_2: ; %merge
206 ; GFX9-O0-NEXT: v_readlane_b32 s6, v5, 6
207 ; GFX9-O0-NEXT: v_readlane_b32 s7, v5, 7
208 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[6:7]
209 ; GFX9-O0-NEXT: v_readlane_b32 s4, v5, 0
210 ; GFX9-O0-NEXT: v_readlane_b32 s5, v5, 1
211 ; GFX9-O0-NEXT: v_readlane_b32 s8, v5, 2
212 ; GFX9-O0-NEXT: v_readlane_b32 s9, v5, 3
213 ; GFX9-O0-NEXT: v_readlane_b32 s10, v5, 4
214 ; GFX9-O0-NEXT: v_readlane_b32 s11, v5, 5
215 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
216 ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
217 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
218 ; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, v3
219 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7]
220 ; GFX9-O0-NEXT: s_mov_b32 s6, 1
221 ; GFX9-O0-NEXT: v_lshlrev_b32_e64 v0, s6, v0
222 ; GFX9-O0-NEXT: s_mov_b32 s6, 2
223 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, s6
224 ; GFX9-O0-NEXT: v_and_b32_e32 v0, v0, v3
225 ; GFX9-O0-NEXT: s_mov_b32 s6, 0
226 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[8:11], s6 offset:4
227 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
228 ; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
229 ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
230 ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
231 ; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
232 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
233 ; GFX9-O0-NEXT: s_setpc_b64 s[4:5]
235 ; GFX9-O3-LABEL: strict_wwm_cfg:
236 ; GFX9-O3: ; %bb.0: ; %entry
237 ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
238 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[8:9], -1
239 ; GFX9-O3-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
240 ; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
241 ; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
242 ; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
243 ; GFX9-O3-NEXT: s_mov_b64 exec, s[8:9]
244 ; GFX9-O3-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
245 ; GFX9-O3-NEXT: v_mov_b32_e32 v5, 0
246 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[8:9], -1
247 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0
248 ; GFX9-O3-NEXT: s_mov_b64 exec, s[8:9]
249 ; GFX9-O3-NEXT: s_not_b64 exec, exec
250 ; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
251 ; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0
252 ; GFX9-O3-NEXT: s_not_b64 exec, exec
253 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[8:9], -1
254 ; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
255 ; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1
256 ; GFX9-O3-NEXT: s_mov_b64 exec, s[8:9]
257 ; GFX9-O3-NEXT: v_mov_b32_e32 v4, v1
258 ; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
259 ; GFX9-O3-NEXT: s_and_saveexec_b64 s[8:9], vcc
260 ; GFX9-O3-NEXT: ; %bb.1: ; %if
261 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[10:11], -1
262 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0
263 ; GFX9-O3-NEXT: s_mov_b64 exec, s[10:11]
264 ; GFX9-O3-NEXT: s_not_b64 exec, exec
265 ; GFX9-O3-NEXT: v_mov_b32_e32 v3, 0
266 ; GFX9-O3-NEXT: s_not_b64 exec, exec
267 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[10:11], -1
268 ; GFX9-O3-NEXT: v_mov_b32_dpp v1, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
269 ; GFX9-O3-NEXT: v_add_u32_e32 v1, v3, v1
270 ; GFX9-O3-NEXT: s_mov_b64 exec, s[10:11]
271 ; GFX9-O3-NEXT: v_mov_b32_e32 v5, v1
272 ; GFX9-O3-NEXT: ; %bb.2: ; %merge
273 ; GFX9-O3-NEXT: s_or_b64 exec, exec, s[8:9]
274 ; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
275 ; GFX9-O3-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
276 ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v0, 1, v0
277 ; GFX9-O3-NEXT: v_and_b32_e32 v0, 2, v0
278 ; GFX9-O3-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4
279 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1
280 ; GFX9-O3-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
281 ; GFX9-O3-NEXT: s_nop 0
282 ; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
283 ; GFX9-O3-NEXT: s_nop 0
284 ; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
285 ; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5]
286 ; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
287 ; GFX9-O3-NEXT: s_setpc_b64 s[30:31]
; Load two dwords from the buffer resource %tmp14; the first element is
; processed here, before the branch.
289 %tmp100 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %tmp14, i32 0, i32 0, i32 0)
290 %tmp101 = bitcast <2 x float> %tmp100 to <2 x i32>
291 %tmp102 = extractelement <2 x i32> %tmp101, i32 0
292 %tmp105 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %tmp102, i32 0)
294 %tmp120 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp105, i32 323, i32 12, i32 15, i1 false)
295 %tmp121 = add i32 %tmp105, %tmp120
296 %tmp122 = tail call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp121)
; Branch to %if when %arg is 0, otherwise go straight to %merge.
298 %cond = icmp eq i32 %arg, 0
299 br i1 %cond, label %if, label %merge
; Second element: same set.inactive / DPP / add / strict.wwm sequence as above.
301 %tmp103 = extractelement <2 x i32> %tmp101, i32 1
302 %tmp107 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %tmp103, i32 0)
304 %tmp135 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp107, i32 323, i32 12, i32 15, i1 false)
305 %tmp136 = add i32 %tmp107, %tmp135
306 %tmp137 = tail call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp136)
; The phi yields 0 when control arrives from %entry and %tmp137 when it arrives
; from %if; the comparison is reduced to 0 or 2 and stored at buffer offset 4.
310 %merge_value = phi i32 [ 0, %entry ], [%tmp137, %if ]
311 %tmp138 = icmp eq i32 %tmp122, %merge_value
312 %tmp139 = sext i1 %tmp138 to i32
313 %tmp140 = shl nsw i32 %tmp139, 1
314 %tmp141 = and i32 %tmp140, 2
315 %tmp145 = bitcast i32 %tmp141 to float
316 call void @llvm.amdgcn.raw.buffer.store.f32(float %tmp145, <4 x i32> %tmp14, i32 4, i32 0, i32 0)
; strict_wwm_called: small noinline i32 helper that strict_wwm_call invokes.
; It computes %sub = (%a + %a) * %a - (%a + %a); the expected output for both
; runs is a plain add / mul / sub sequence with no exec-mask manipulation.
320 define hidden i32 @strict_wwm_called(i32 %a) noinline {
321 ; GFX9-O0-LABEL: strict_wwm_called:
323 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
324 ; GFX9-O0-NEXT: v_add_u32_e64 v1, v0, v0
325 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4
326 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4
327 ; GFX9-O0-NEXT: v_mul_lo_u32 v0, v1, v0
328 ; GFX9-O0-NEXT: v_sub_u32_e64 v0, v0, v1
329 ; GFX9-O0-NEXT: s_setpc_b64 s[30:31]
331 ; GFX9-O3-LABEL: strict_wwm_called:
333 ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
334 ; GFX9-O3-NEXT: v_add_u32_e32 v1, v0, v0
335 ; GFX9-O3-NEXT: v_mul_lo_u32 v0, v1, v0
336 ; GFX9-O3-NEXT: v_sub_u32_e32 v0, v0, v1
337 ; GFX9-O3-NEXT: s_setpc_b64 s[30:31]
338 %add = add i32 %a, %a
339 %mul = mul i32 %add, %a
340 %sub = sub i32 %mul, %add
; strict_wwm_call: function call inside a strict WWM computation.
; The inreg %arg gets 0 in inactive lanes via set.inactive, is passed to
; strict_wwm_called, and the call result plus the argument feeds strict.wwm
; before being stored to the buffer at offset 4.
; In the expected output for both runs the s_swappc_b64 and the following add
; sit between "s_or_saveexec_b64 ..., -1" and the "s_mov_b64 exec, ..." that
; restores the saved mask.
344 define amdgpu_gfx void @strict_wwm_call(<4 x i32> inreg %tmp14, i32 inreg %arg) {
345 ; GFX9-O0-LABEL: strict_wwm_call:
347 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
348 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[10:11], -1
349 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
350 ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
351 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
352 ; GFX9-O0-NEXT: s_mov_b64 exec, s[10:11]
353 ; GFX9-O0-NEXT: v_writelane_b32 v3, s33, 7
354 ; GFX9-O0-NEXT: s_mov_b32 s33, s32
355 ; GFX9-O0-NEXT: s_add_i32 s32, s32, 0x400
356 ; GFX9-O0-NEXT: v_writelane_b32 v3, s30, 0
357 ; GFX9-O0-NEXT: v_writelane_b32 v3, s31, 1
358 ; GFX9-O0-NEXT: v_writelane_b32 v3, s8, 2
359 ; GFX9-O0-NEXT: s_mov_b32 s8, s4
360 ; GFX9-O0-NEXT: v_readlane_b32 s4, v3, 2
361 ; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
362 ; GFX9-O0-NEXT: s_mov_b32 s9, s5
363 ; GFX9-O0-NEXT: s_mov_b32 s10, s6
364 ; GFX9-O0-NEXT: s_mov_b32 s11, s7
365 ; GFX9-O0-NEXT: v_writelane_b32 v3, s8, 3
366 ; GFX9-O0-NEXT: v_writelane_b32 v3, s9, 4
367 ; GFX9-O0-NEXT: v_writelane_b32 v3, s10, 5
368 ; GFX9-O0-NEXT: v_writelane_b32 v3, s11, 6
369 ; GFX9-O0-NEXT: s_mov_b32 s8, 0
370 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4
371 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0
372 ; GFX9-O0-NEXT: s_not_b64 exec, exec
373 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s8
374 ; GFX9-O0-NEXT: s_not_b64 exec, exec
375 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[10:11], -1
376 ; GFX9-O0-NEXT: s_getpc_b64 s[4:5]
377 ; GFX9-O0-NEXT: s_add_u32 s4, s4, strict_wwm_called@rel32@lo+4
378 ; GFX9-O0-NEXT: s_addc_u32 s5, s5, strict_wwm_called@rel32@hi+12
379 ; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[2:3]
380 ; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[0:1]
381 ; GFX9-O0-NEXT: s_mov_b64 s[0:1], s[12:13]
382 ; GFX9-O0-NEXT: s_mov_b64 s[2:3], s[14:15]
383 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2
384 ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[4:5]
385 ; GFX9-O0-NEXT: v_readlane_b32 s4, v3, 3
386 ; GFX9-O0-NEXT: v_readlane_b32 s5, v3, 4
387 ; GFX9-O0-NEXT: v_readlane_b32 s6, v3, 5
388 ; GFX9-O0-NEXT: v_readlane_b32 s7, v3, 6
389 ; GFX9-O0-NEXT: v_readlane_b32 s30, v3, 0
390 ; GFX9-O0-NEXT: v_readlane_b32 s31, v3, 1
391 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0
392 ; GFX9-O0-NEXT: v_add_u32_e64 v1, v1, v2
393 ; GFX9-O0-NEXT: s_mov_b64 exec, s[10:11]
394 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1
395 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[4:7], s8 offset:4
396 ; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffffc00
397 ; GFX9-O0-NEXT: v_readlane_b32 s33, v3, 7
398 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[4:5], -1
399 ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
400 ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
401 ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
402 ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5]
403 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
404 ; GFX9-O0-NEXT: s_setpc_b64 s[30:31]
406 ; GFX9-O3-LABEL: strict_wwm_call:
408 ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
409 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[10:11], -1
410 ; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
411 ; GFX9-O3-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
412 ; GFX9-O3-NEXT: s_mov_b64 exec, s[10:11]
413 ; GFX9-O3-NEXT: s_mov_b32 s14, s33
414 ; GFX9-O3-NEXT: s_mov_b32 s33, s32
415 ; GFX9-O3-NEXT: s_addk_i32 s32, 0x400
416 ; GFX9-O3-NEXT: s_mov_b64 s[10:11], s[30:31]
417 ; GFX9-O3-NEXT: v_mov_b32_e32 v2, s8
418 ; GFX9-O3-NEXT: s_not_b64 exec, exec
419 ; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0
420 ; GFX9-O3-NEXT: s_not_b64 exec, exec
421 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[8:9], -1
422 ; GFX9-O3-NEXT: v_mov_b32_e32 v0, v2
423 ; GFX9-O3-NEXT: s_getpc_b64 s[12:13]
424 ; GFX9-O3-NEXT: s_add_u32 s12, s12, strict_wwm_called@rel32@lo+4
425 ; GFX9-O3-NEXT: s_addc_u32 s13, s13, strict_wwm_called@rel32@hi+12
426 ; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[12:13]
427 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, v0
428 ; GFX9-O3-NEXT: v_add_u32_e32 v1, v1, v2
429 ; GFX9-O3-NEXT: s_mov_b64 exec, s[8:9]
430 ; GFX9-O3-NEXT: v_mov_b32_e32 v0, v1
431 ; GFX9-O3-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4
432 ; GFX9-O3-NEXT: s_addk_i32 s32, 0xfc00
433 ; GFX9-O3-NEXT: s_mov_b32 s33, s14
434 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1
435 ; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
436 ; GFX9-O3-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
437 ; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5]
438 ; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
439 ; GFX9-O3-NEXT: s_setpc_b64 s[10:11]
; Inactive lanes get 0 for %arg; the call result and the add both feed
; strict.wwm, whose value is stored at byte offset 4 of the buffer %tmp14.
440 %tmp107 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %arg, i32 0)
441 %tmp134 = call i32 @strict_wwm_called(i32 %tmp107)
442 %tmp136 = add i32 %tmp134, %tmp107
443 %tmp137 = tail call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp136)
444 call void @llvm.amdgcn.raw.buffer.store.i32(i32 %tmp137, <4 x i32> %tmp14, i32 4, i32 0, i32 0)
; strict_wwm_called_i64: 64-bit counterpart of strict_wwm_called, invoked by
; strict_wwm_call_i64. It computes %sub = (%a + %a) * %a - (%a + %a) on i64.
; The expected output builds this from 32-bit halves: an add/addc carry pair,
; v_mul_lo_u32 / v_mul_hi_u32 partial products combined with v_add3_u32, and a
; sub/subb borrow pair. The -O0 run additionally contains many register copies
; and 64-bit shifts by 32 around those operations.
448 define amdgpu_gfx i64 @strict_wwm_called_i64(i64 %a) noinline {
449 ; GFX9-O0-LABEL: strict_wwm_called_i64:
451 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
452 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v0
453 ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
454 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v1
455 ; GFX9-O0-NEXT: ; kill: def $vgpr0_vgpr1 killed $vgpr6_vgpr7 killed $exec
456 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6
457 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6
458 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7
459 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7
460 ; GFX9-O0-NEXT: v_add_co_u32_e64 v4, s[4:5], v2, v3
461 ; GFX9-O0-NEXT: v_addc_co_u32_e64 v0, s[4:5], v0, v1, s[4:5]
462 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
463 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0
464 ; GFX9-O0-NEXT: s_mov_b32 s4, 32
465 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6
466 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7
467 ; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1]
468 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0
469 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4
470 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5
471 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5
472 ; GFX9-O0-NEXT: v_mul_lo_u32 v2, v0, v1
473 ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
474 ; GFX9-O0-NEXT: v_mul_hi_u32 v1, v0, v6
475 ; GFX9-O0-NEXT: v_lshrrev_b64 v[7:8], s4, v[4:5]
476 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v7
477 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5
478 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5
479 ; GFX9-O0-NEXT: v_mul_lo_u32 v3, v3, v6
480 ; GFX9-O0-NEXT: v_add3_u32 v1, v1, v2, v3
481 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5
482 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6
483 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, s5
484 ; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
485 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3
486 ; GFX9-O0-NEXT: v_lshlrev_b64 v[1:2], s4, v[1:2]
487 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2
488 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5
489 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5
490 ; GFX9-O0-NEXT: v_mul_lo_u32 v6, v0, v6
491 ; GFX9-O0-NEXT: s_mov_b32 s5, 0
492 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, 0
493 ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
494 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v0
495 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7
496 ; GFX9-O0-NEXT: v_or_b32_e32 v0, v0, v3
497 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1
498 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6
499 ; GFX9-O0-NEXT: v_or_b32_e32 v6, v1, v2
500 ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
501 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v0
502 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6
503 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4
504 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7
505 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5
506 ; GFX9-O0-NEXT: v_sub_co_u32_e64 v1, s[6:7], v1, v3
507 ; GFX9-O0-NEXT: v_subb_co_u32_e64 v0, s[6:7], v0, v2, s[6:7]
508 ; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
509 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0
510 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1
511 ; GFX9-O0-NEXT: v_lshrrev_b64 v[1:2], s4, v[1:2]
512 ; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr1_vgpr2 killed $exec
513 ; GFX9-O0-NEXT: s_setpc_b64 s[30:31]
515 ; GFX9-O3-LABEL: strict_wwm_called_i64:
517 ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
518 ; GFX9-O3-NEXT: v_add_co_u32_e32 v2, vcc, v0, v0
519 ; GFX9-O3-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v1, vcc
520 ; GFX9-O3-NEXT: v_mul_lo_u32 v4, v3, v0
521 ; GFX9-O3-NEXT: v_mul_hi_u32 v5, v2, v0
522 ; GFX9-O3-NEXT: v_mul_lo_u32 v1, v2, v1
523 ; GFX9-O3-NEXT: v_mul_lo_u32 v0, v2, v0
524 ; GFX9-O3-NEXT: v_add3_u32 v1, v5, v1, v4
525 ; GFX9-O3-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2
526 ; GFX9-O3-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
527 ; GFX9-O3-NEXT: s_setpc_b64 s[30:31]
528 %add = add i64 %a, %a
529 %mul = mul i64 %add, %a
530 %sub = sub i64 %mul, %add
534 define amdgpu_gfx void @strict_wwm_call_i64(<4 x i32> inreg %tmp14, i64 inreg %arg) {
535 ; GFX9-O0-LABEL: strict_wwm_call_i64:
537 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
538 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[10:11], -1
539 ; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
540 ; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill
541 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
542 ; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
543 ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
544 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
545 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
546 ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
547 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
548 ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
549 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
550 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
551 ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
552 ; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
553 ; GFX9-O0-NEXT: s_mov_b64 exec, s[10:11]
554 ; GFX9-O0-NEXT: v_writelane_b32 v11, s33, 9
555 ; GFX9-O0-NEXT: s_mov_b32 s33, s32
556 ; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xc00
557 ; GFX9-O0-NEXT: v_writelane_b32 v11, s30, 0
558 ; GFX9-O0-NEXT: v_writelane_b32 v11, s31, 1
559 ; GFX9-O0-NEXT: v_writelane_b32 v11, s9, 2
560 ; GFX9-O0-NEXT: v_writelane_b32 v11, s8, 3
561 ; GFX9-O0-NEXT: s_mov_b32 s8, s6
562 ; GFX9-O0-NEXT: v_readlane_b32 s6, v11, 3
563 ; GFX9-O0-NEXT: v_writelane_b32 v11, s8, 4
564 ; GFX9-O0-NEXT: s_mov_b32 s12, s5
565 ; GFX9-O0-NEXT: v_readlane_b32 s5, v11, 4
566 ; GFX9-O0-NEXT: s_mov_b32 s8, s4
567 ; GFX9-O0-NEXT: v_readlane_b32 s4, v11, 2
568 ; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
569 ; GFX9-O0-NEXT: s_mov_b32 s9, s12
570 ; GFX9-O0-NEXT: s_mov_b32 s10, s5
571 ; GFX9-O0-NEXT: s_mov_b32 s11, s7
572 ; GFX9-O0-NEXT: v_writelane_b32 v11, s8, 5
573 ; GFX9-O0-NEXT: v_writelane_b32 v11, s9, 6
574 ; GFX9-O0-NEXT: v_writelane_b32 v11, s10, 7
575 ; GFX9-O0-NEXT: v_writelane_b32 v11, s11, 8
576 ; GFX9-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
577 ; GFX9-O0-NEXT: s_mov_b32 s7, s4
578 ; GFX9-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr6_sgpr7
579 ; GFX9-O0-NEXT: s_mov_b64 s[4:5], 0
580 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s6
581 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s7
582 ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v1
583 ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v0
584 ; GFX9-O0-NEXT: s_not_b64 exec, exec
585 ; GFX9-O0-NEXT: v_mov_b32_e32 v9, s4
586 ; GFX9-O0-NEXT: v_mov_b32_e32 v10, s5
587 ; GFX9-O0-NEXT: s_not_b64 exec, exec
588 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
589 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v9
590 ; GFX9-O0-NEXT: s_mov_b32 s4, 32
591 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6_sgpr7
592 ; GFX9-O0-NEXT: v_lshrrev_b64 v[3:4], s4, v[9:10]
593 ; GFX9-O0-NEXT: s_getpc_b64 s[4:5]
594 ; GFX9-O0-NEXT: s_add_u32 s4, s4, strict_wwm_called_i64@gotpcrel32@lo+4
595 ; GFX9-O0-NEXT: s_addc_u32 s5, s5, strict_wwm_called_i64@gotpcrel32@hi+12
596 ; GFX9-O0-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
597 ; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[2:3]
598 ; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[0:1]
599 ; GFX9-O0-NEXT: s_mov_b64 s[0:1], s[12:13]
600 ; GFX9-O0-NEXT: s_mov_b64 s[2:3], s[14:15]
601 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2
602 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
603 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0)
604 ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[4:5]
605 ; GFX9-O0-NEXT: v_readlane_b32 s4, v11, 5
606 ; GFX9-O0-NEXT: v_readlane_b32 s5, v11, 6
607 ; GFX9-O0-NEXT: v_readlane_b32 s6, v11, 7
608 ; GFX9-O0-NEXT: v_readlane_b32 s7, v11, 8
609 ; GFX9-O0-NEXT: v_readlane_b32 s30, v11, 0
610 ; GFX9-O0-NEXT: v_readlane_b32 s31, v11, 1
611 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0
612 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
613 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v9
614 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10
615 ; GFX9-O0-NEXT: v_add_co_u32_e64 v2, s[10:11], v2, v4
616 ; GFX9-O0-NEXT: v_addc_co_u32_e64 v3, s[10:11], v3, v5, s[10:11]
617 ; GFX9-O0-NEXT: s_mov_b64 exec, s[8:9]
618 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2
619 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
620 ; GFX9-O0-NEXT: s_mov_b32 s8, 0
621 ; GFX9-O0-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], s8 offset:4
622 ; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffff400
623 ; GFX9-O0-NEXT: v_readlane_b32 s33, v11, 9
624 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[4:5], -1
625 ; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
626 ; GFX9-O0-NEXT: s_nop 0
627 ; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload
628 ; GFX9-O0-NEXT: s_nop 0
629 ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
630 ; GFX9-O0-NEXT: s_nop 0
631 ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
632 ; GFX9-O0-NEXT: s_nop 0
633 ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
634 ; GFX9-O0-NEXT: s_nop 0
635 ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
636 ; GFX9-O0-NEXT: s_nop 0
637 ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
638 ; GFX9-O0-NEXT: s_nop 0
639 ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
640 ; GFX9-O0-NEXT: s_nop 0
641 ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
642 ; GFX9-O0-NEXT: s_nop 0
643 ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
644 ; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
645 ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5]
646 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
647 ; GFX9-O0-NEXT: s_setpc_b64 s[30:31]
649 ; GFX9-O3-LABEL: strict_wwm_call_i64:
651 ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
652 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[10:11], -1
653 ; GFX9-O3-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill
654 ; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
655 ; GFX9-O3-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
656 ; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
657 ; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
658 ; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
659 ; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
660 ; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
661 ; GFX9-O3-NEXT: s_mov_b64 exec, s[10:11]
662 ; GFX9-O3-NEXT: s_mov_b32 s14, s33
663 ; GFX9-O3-NEXT: s_mov_b32 s33, s32
664 ; GFX9-O3-NEXT: s_addk_i32 s32, 0x800
665 ; GFX9-O3-NEXT: s_mov_b64 s[10:11], s[30:31]
666 ; GFX9-O3-NEXT: v_mov_b32_e32 v6, s8
667 ; GFX9-O3-NEXT: v_mov_b32_e32 v7, s9
668 ; GFX9-O3-NEXT: s_not_b64 exec, exec
669 ; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0
670 ; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0
671 ; GFX9-O3-NEXT: s_not_b64 exec, exec
672 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[8:9], -1
673 ; GFX9-O3-NEXT: s_getpc_b64 s[12:13]
674 ; GFX9-O3-NEXT: s_add_u32 s12, s12, strict_wwm_called_i64@gotpcrel32@lo+4
675 ; GFX9-O3-NEXT: s_addc_u32 s13, s13, strict_wwm_called_i64@gotpcrel32@hi+12
676 ; GFX9-O3-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0
677 ; GFX9-O3-NEXT: v_mov_b32_e32 v0, v6
678 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7
679 ; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0)
680 ; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[12:13]
681 ; GFX9-O3-NEXT: v_mov_b32_e32 v2, v0
682 ; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1
683 ; GFX9-O3-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
684 ; GFX9-O3-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc
685 ; GFX9-O3-NEXT: s_mov_b64 exec, s[8:9]
686 ; GFX9-O3-NEXT: v_mov_b32_e32 v0, v2
687 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, v3
688 ; GFX9-O3-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 offset:4
689 ; GFX9-O3-NEXT: s_addk_i32 s32, 0xf800
690 ; GFX9-O3-NEXT: s_mov_b32 s33, s14
691 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1
692 ; GFX9-O3-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
693 ; GFX9-O3-NEXT: s_nop 0
694 ; GFX9-O3-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
695 ; GFX9-O3-NEXT: s_nop 0
696 ; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
697 ; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
698 ; GFX9-O3-NEXT: s_nop 0
699 ; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
700 ; GFX9-O3-NEXT: s_nop 0
701 ; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
702 ; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5]
703 ; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
704 ; GFX9-O3-NEXT: s_setpc_b64 s[10:11]
705 %tmp107 = tail call i64 @llvm.amdgcn.set.inactive.i64(i64 %arg, i64 0)
706 %tmp134 = call i64 @strict_wwm_called_i64(i64 %tmp107)
707 %tmp136 = add i64 %tmp134, %tmp107
708 %tmp137 = tail call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp136)
709 %tmp138 = bitcast i64 %tmp137 to <2 x i32>
710 call void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32> %tmp138, <4 x i32> %tmp14, i32 4, i32 0, i32 0)
; strict_wwm_amdgpu_cs_main: takes a buffer descriptor (%desc, inreg) and an
; index. It loads a <4 x i32> at byte offset index*32 and a <2 x i32> 16 bytes
; further on, treats the data as three i64 values, runs each one through
; llvm.amdgcn.set.inactive.i64 (inactive-lane value 9223372036854775807, i.e.
; 0x7fffffffffffffff) followed by llvm.amdgcn.strict.wwm.i64, and stores the
; results back to the same offsets with raw.buffer.store.
;
; The GFX9-O0 / GFX9-O3 lines below are autogenerated (see the NOTE on the
; first line of the file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
714 define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %index) {
715 ; GFX9-O0-LABEL: strict_wwm_amdgpu_cs_main:
717 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
718 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
719 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
720 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
721 ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
722 ; GFX9-O0-NEXT: s_mov_b64 exec, s[8:9]
723 ; GFX9-O0-NEXT: s_mov_b32 s8, s7
724 ; GFX9-O0-NEXT: s_mov_b32 s9, s6
725 ; GFX9-O0-NEXT: s_mov_b32 s10, s5
726 ; GFX9-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
727 ; GFX9-O0-NEXT: s_mov_b32 s5, s10
728 ; GFX9-O0-NEXT: s_mov_b32 s6, s9
729 ; GFX9-O0-NEXT: s_mov_b32 s7, s8
730 ; GFX9-O0-NEXT: ; kill: def $sgpr8_sgpr9_sgpr10_sgpr11 killed $sgpr4_sgpr5_sgpr6_sgpr7
731 ; GFX9-O0-NEXT: s_mov_b32 s8, 5
732 ; GFX9-O0-NEXT: v_lshlrev_b32_e64 v0, s8, v0
733 ; GFX9-O0-NEXT: s_mov_b32 s8, 0
734 ; GFX9-O0-NEXT: buffer_load_dwordx4 v[10:13], v0, s[4:7], s8 offen
735 ; GFX9-O0-NEXT: buffer_load_dwordx2 v[3:4], v0, s[4:7], s8 offen offset:16
736 ; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
737 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v11
738 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10
739 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
740 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7
741 ; GFX9-O0-NEXT: s_mov_b32 s9, 0x7fffffff
742 ; GFX9-O0-NEXT: s_mov_b32 s10, -1
743 ; GFX9-O0-NEXT: ; kill: def $sgpr10 killed $sgpr10 def $sgpr10_sgpr11
744 ; GFX9-O0-NEXT: s_mov_b32 s11, s9
745 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5
746 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6
747 ; GFX9-O0-NEXT: s_not_b64 exec, exec
748 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10
749 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s11
750 ; GFX9-O0-NEXT: s_not_b64 exec, exec
751 ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1
752 ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2
753 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v13
754 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12
755 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
756 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7
757 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5
758 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6
759 ; GFX9-O0-NEXT: s_not_b64 exec, exec
760 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10
761 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s11
762 ; GFX9-O0-NEXT: s_not_b64 exec, exec
763 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1
764 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2
765 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
766 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
767 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4
768 ; GFX9-O0-NEXT: s_not_b64 exec, exec
769 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10
770 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s11
771 ; GFX9-O0-NEXT: s_not_b64 exec, exec
772 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
773 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2
774 ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9
775 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8
776 ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7
777 ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v6
778 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6_vgpr7_vgpr8 killed $exec
779 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11
780 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10
781 ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v9
782 ; GFX9-O0-NEXT: buffer_store_dwordx4 v[5:8], v0, s[4:7], s8 offen
783 ; GFX9-O0-NEXT: buffer_store_dwordx2 v[3:4], v0, s[4:7], s8 offen offset:16
784 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[4:5], -1
785 ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
786 ; GFX9-O0-NEXT: s_nop 0
787 ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
788 ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5]
789 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
790 ; GFX9-O0-NEXT: s_setpc_b64 s[30:31]
792 ; GFX9-O3-LABEL: strict_wwm_amdgpu_cs_main:
794 ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
795 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[8:9], -1
796 ; GFX9-O3-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
797 ; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
798 ; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
799 ; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
800 ; GFX9-O3-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
801 ; GFX9-O3-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
802 ; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
803 ; GFX9-O3-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
804 ; GFX9-O3-NEXT: s_mov_b64 exec, s[8:9]
805 ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v0, 5, v0
806 ; GFX9-O3-NEXT: buffer_load_dwordx4 v[1:4], v0, s[4:7], 0 offen
807 ; GFX9-O3-NEXT: buffer_load_dwordx2 v[5:6], v0, s[4:7], 0 offen offset:16
808 ; GFX9-O3-NEXT: s_mov_b32 s8, -1
809 ; GFX9-O3-NEXT: s_brev_b32 s9, -2
810 ; GFX9-O3-NEXT: s_not_b64 exec, exec
811 ; GFX9-O3-NEXT: s_waitcnt vmcnt(1)
812 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, s8
813 ; GFX9-O3-NEXT: v_mov_b32_e32 v2, s9
814 ; GFX9-O3-NEXT: s_not_b64 exec, exec
815 ; GFX9-O3-NEXT: s_not_b64 exec, exec
816 ; GFX9-O3-NEXT: v_mov_b32_e32 v3, s8
817 ; GFX9-O3-NEXT: v_mov_b32_e32 v4, s9
818 ; GFX9-O3-NEXT: s_not_b64 exec, exec
819 ; GFX9-O3-NEXT: s_not_b64 exec, exec
820 ; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
821 ; GFX9-O3-NEXT: v_mov_b32_e32 v5, s8
822 ; GFX9-O3-NEXT: v_mov_b32_e32 v6, s9
823 ; GFX9-O3-NEXT: s_not_b64 exec, exec
824 ; GFX9-O3-NEXT: v_mov_b32_e32 v7, v1
825 ; GFX9-O3-NEXT: v_mov_b32_e32 v9, v3
826 ; GFX9-O3-NEXT: v_mov_b32_e32 v11, v5
827 ; GFX9-O3-NEXT: v_mov_b32_e32 v8, v2
828 ; GFX9-O3-NEXT: v_mov_b32_e32 v10, v4
829 ; GFX9-O3-NEXT: v_mov_b32_e32 v12, v6
830 ; GFX9-O3-NEXT: buffer_store_dwordx4 v[7:10], v0, s[4:7], 0 offen
831 ; GFX9-O3-NEXT: buffer_store_dwordx2 v[11:12], v0, s[4:7], 0 offen offset:16
832 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1
833 ; GFX9-O3-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
834 ; GFX9-O3-NEXT: s_nop 0
835 ; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
836 ; GFX9-O3-NEXT: s_nop 0
837 ; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
838 ; GFX9-O3-NEXT: s_nop 0
839 ; GFX9-O3-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
840 ; GFX9-O3-NEXT: s_nop 0
841 ; GFX9-O3-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
842 ; GFX9-O3-NEXT: s_nop 0
843 ; GFX9-O3-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
844 ; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5]
845 ; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
846 ; GFX9-O3-NEXT: s_setpc_b64 s[30:31]
; Byte offset of the first load/store: %index << 5, i.e. index * 32.
847 %tmp17 = shl i32 %index, 5
848 %tmp18 = tail call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %desc, i32 %tmp17, i32 0)
849 %.i0.upto1.bc = bitcast <4 x i32> %tmp18 to <2 x i64>
; The low five bits of %tmp17 are zero after the shift, so `or 16` is the same
; as adding 16: the second load/store sits 16 bytes past the first.
850 %tmp19 = or i32 %tmp17, 16
851 %tmp20 = tail call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %desc, i32 %tmp19, i32 0)
; For each of the three i64 values: set.inactive with the constant
; 9223372036854775807 (0x7fffffffffffffff), then strict.wwm on the result.
852 %.i0.upto1.extract = extractelement <2 x i64> %.i0.upto1.bc, i32 0
853 %tmp22 = tail call i64 @llvm.amdgcn.set.inactive.i64(i64 %.i0.upto1.extract, i64 9223372036854775807)
854 %tmp97 = tail call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp22)
855 %.i1.upto1.extract = extractelement <2 x i64> %.i0.upto1.bc, i32 1
856 %tmp99 = tail call i64 @llvm.amdgcn.set.inactive.i64(i64 %.i1.upto1.extract, i64 9223372036854775807)
857 %tmp174 = tail call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp99)
858 %.i25 = bitcast <2 x i32> %tmp20 to i64
859 %tmp176 = tail call i64 @llvm.amdgcn.set.inactive.i64(i64 %.i25, i64 9223372036854775807)
860 %tmp251 = tail call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp176)
; Reinterpret the results as float vectors; the first two are concatenated into
; a <4 x float> and stored at %tmp17, the third is stored at %tmp19.
861 %.cast = bitcast i64 %tmp97 to <2 x float>
862 %.cast6 = bitcast i64 %tmp174 to <2 x float>
863 %.cast7 = bitcast i64 %tmp251 to <2 x float>
864 %tmp254 = shufflevector <2 x float> %.cast, <2 x float> %.cast6, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
865 tail call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %tmp254, <4 x i32> %desc, i32 %tmp17, i32 0, i32 0)
866 tail call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %.cast7, <4 x i32> %desc, i32 %tmp19, i32 0, i32 0)
; Declarations of the AMDGPU intrinsics called by the test functions in this
; file: strict.wwm and set.inactive (i32 and i64 variants), update.dpp, and the
; raw/s buffer load and store variants.
; NOTE(review): the i32 strict.wwm/set.inactive, update.dpp, raw.buffer.load.v2f32
; and raw.buffer.store.f32/.i32 declarations are presumably used by functions
; earlier in the file; confirm before pruning any of them.
870 declare i32 @llvm.amdgcn.strict.wwm.i32(i32)
871 declare i64 @llvm.amdgcn.strict.wwm.i64(i64)
872 declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32)
873 declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64)
874 declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1)
875 declare <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32>, i32, i32, i32)
876 declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32)
877 declare void @llvm.amdgcn.raw.buffer.store.i32(i32, <4 x i32>, i32, i32, i32)
878 declare void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32>, <4 x i32>, i32, i32, i32)
879 declare void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i32)
880 declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32)
881 declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32>, i32, i32)
882 declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32)