1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
2 ; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck %s
; @main: llc test input carrying attribute group #0 ("amdgpu-waves-per-eu"="10,10").
; The autogenerated assertions that follow expect:
;   * v1, v5 and v4 to be stored to the stack on entry and reloaded before the
;     return, each bracketed by an s_xor_saveexec_b64 / s_mov_b64 exec pair;
;   * v1 to receive s30, s31 and s36-s67 through v_writelane_b32 (lanes 0-33),
;     with the same lanes read back in reverse order before s_setpc_b64;
;   * v5 and v4 to be the registers annotated "SGPR spill to VGPR lane", holding
;     the loaded descriptor values that are read back with v_readlane_b32 in the
;     individual blocks.
; NOTE(review): in this view the labels for the entry block, %bb32 and %bb48,
; several block terminators and the closing brace of the function are not
; visible - confirm against the complete file before editing the IR.
4 define void @main(i1 %arg) #0 {
6 ; CHECK: ; %bb.0: ; %bb
7 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8 ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1
9 ; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
10 ; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
11 ; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
12 ; CHECK-NEXT: s_mov_b64 exec, s[4:5]
13 ; CHECK-NEXT: v_writelane_b32 v1, s30, 0
14 ; CHECK-NEXT: v_writelane_b32 v1, s31, 1
15 ; CHECK-NEXT: v_writelane_b32 v1, s36, 2
16 ; CHECK-NEXT: v_writelane_b32 v1, s37, 3
17 ; CHECK-NEXT: v_writelane_b32 v1, s38, 4
18 ; CHECK-NEXT: v_writelane_b32 v1, s39, 5
19 ; CHECK-NEXT: v_writelane_b32 v1, s40, 6
20 ; CHECK-NEXT: v_writelane_b32 v1, s41, 7
21 ; CHECK-NEXT: v_writelane_b32 v1, s42, 8
22 ; CHECK-NEXT: v_writelane_b32 v1, s43, 9
23 ; CHECK-NEXT: v_writelane_b32 v1, s44, 10
24 ; CHECK-NEXT: v_writelane_b32 v1, s45, 11
25 ; CHECK-NEXT: v_writelane_b32 v1, s46, 12
26 ; CHECK-NEXT: v_writelane_b32 v1, s47, 13
27 ; CHECK-NEXT: v_writelane_b32 v1, s48, 14
28 ; CHECK-NEXT: v_writelane_b32 v1, s49, 15
29 ; CHECK-NEXT: s_getpc_b64 s[24:25]
30 ; CHECK-NEXT: v_writelane_b32 v1, s50, 16
31 ; CHECK-NEXT: s_movk_i32 s4, 0xf0
32 ; CHECK-NEXT: s_mov_b32 s5, s24
33 ; CHECK-NEXT: v_writelane_b32 v1, s51, 17
34 ; CHECK-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0
35 ; CHECK-NEXT: ; implicit-def: $vgpr5 : SGPR spill to VGPR lane
36 ; CHECK-NEXT: s_mov_b64 s[4:5], 0
37 ; CHECK-NEXT: s_load_dwordx4 s[28:31], s[4:5], 0x0
38 ; CHECK-NEXT: s_movk_i32 s4, 0x130
39 ; CHECK-NEXT: s_mov_b32 s5, s24
40 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
41 ; CHECK-NEXT: v_writelane_b32 v5, s36, 0
42 ; CHECK-NEXT: v_writelane_b32 v5, s37, 1
43 ; CHECK-NEXT: v_writelane_b32 v5, s38, 2
44 ; CHECK-NEXT: v_writelane_b32 v5, s39, 3
45 ; CHECK-NEXT: v_writelane_b32 v5, s40, 4
46 ; CHECK-NEXT: v_writelane_b32 v5, s41, 5
47 ; CHECK-NEXT: v_writelane_b32 v5, s42, 6
48 ; CHECK-NEXT: v_writelane_b32 v5, s43, 7
49 ; CHECK-NEXT: v_writelane_b32 v5, s44, 8
50 ; CHECK-NEXT: v_writelane_b32 v5, s45, 9
51 ; CHECK-NEXT: v_writelane_b32 v5, s46, 10
52 ; CHECK-NEXT: s_load_dwordx16 s[4:19], s[4:5], 0x0
53 ; CHECK-NEXT: v_writelane_b32 v5, s47, 11
54 ; CHECK-NEXT: v_writelane_b32 v5, s48, 12
55 ; CHECK-NEXT: v_writelane_b32 v5, s49, 13
56 ; CHECK-NEXT: s_mov_b32 s20, 0
57 ; CHECK-NEXT: v_mov_b32_e32 v2, 0
58 ; CHECK-NEXT: v_writelane_b32 v5, s50, 14
59 ; CHECK-NEXT: v_mov_b32_e32 v6, s28
60 ; CHECK-NEXT: v_mov_b32_e32 v7, v2
61 ; CHECK-NEXT: s_mov_b32 s21, s20
62 ; CHECK-NEXT: s_mov_b32 s22, s20
63 ; CHECK-NEXT: s_mov_b32 s23, s20
64 ; CHECK-NEXT: v_writelane_b32 v5, s51, 15
65 ; CHECK-NEXT: v_mov_b32_e32 v3, v2
66 ; CHECK-NEXT: image_sample_lz v6, v[6:7], s[44:51], s[20:23] dmask:0x1
67 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
68 ; CHECK-NEXT: v_writelane_b32 v5, s4, 16
69 ; CHECK-NEXT: v_writelane_b32 v5, s5, 17
70 ; CHECK-NEXT: v_writelane_b32 v5, s6, 18
71 ; CHECK-NEXT: v_writelane_b32 v5, s7, 19
72 ; CHECK-NEXT: v_writelane_b32 v5, s8, 20
73 ; CHECK-NEXT: v_writelane_b32 v5, s9, 21
74 ; CHECK-NEXT: image_sample_lz v7, v[2:3], s[4:11], s[20:23] dmask:0x1
75 ; CHECK-NEXT: v_writelane_b32 v5, s10, 22
76 ; CHECK-NEXT: v_writelane_b32 v5, s11, 23
77 ; CHECK-NEXT: v_writelane_b32 v5, s12, 24
78 ; CHECK-NEXT: v_writelane_b32 v5, s13, 25
79 ; CHECK-NEXT: v_writelane_b32 v5, s14, 26
80 ; CHECK-NEXT: v_writelane_b32 v5, s15, 27
81 ; CHECK-NEXT: v_writelane_b32 v5, s16, 28
82 ; CHECK-NEXT: v_writelane_b32 v1, s52, 18
83 ; CHECK-NEXT: v_writelane_b32 v5, s17, 29
84 ; CHECK-NEXT: v_writelane_b32 v1, s53, 19
85 ; CHECK-NEXT: v_writelane_b32 v5, s18, 30
86 ; CHECK-NEXT: v_writelane_b32 v1, s54, 20
87 ; CHECK-NEXT: v_writelane_b32 v5, s19, 31
88 ; CHECK-NEXT: s_mov_b32 s4, 48
89 ; CHECK-NEXT: s_mov_b32 s5, s24
90 ; CHECK-NEXT: v_writelane_b32 v1, s55, 21
91 ; CHECK-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
92 ; CHECK-NEXT: v_writelane_b32 v1, s56, 22
93 ; CHECK-NEXT: v_writelane_b32 v1, s57, 23
94 ; CHECK-NEXT: v_writelane_b32 v1, s58, 24
95 ; CHECK-NEXT: v_writelane_b32 v1, s59, 25
96 ; CHECK-NEXT: v_writelane_b32 v1, s60, 26
97 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
98 ; CHECK-NEXT: v_writelane_b32 v5, s4, 32
99 ; CHECK-NEXT: v_writelane_b32 v1, s61, 27
100 ; CHECK-NEXT: v_writelane_b32 v5, s5, 33
101 ; CHECK-NEXT: v_writelane_b32 v1, s62, 28
102 ; CHECK-NEXT: v_writelane_b32 v5, s6, 34
103 ; CHECK-NEXT: v_writelane_b32 v1, s63, 29
104 ; CHECK-NEXT: v_writelane_b32 v5, s7, 35
105 ; CHECK-NEXT: v_writelane_b32 v1, s64, 30
106 ; CHECK-NEXT: v_writelane_b32 v5, s8, 36
107 ; CHECK-NEXT: v_writelane_b32 v1, s65, 31
108 ; CHECK-NEXT: v_writelane_b32 v5, s9, 37
109 ; CHECK-NEXT: v_writelane_b32 v1, s66, 32
110 ; CHECK-NEXT: s_movk_i32 s26, 0x1f0
111 ; CHECK-NEXT: s_movk_i32 s28, 0x2f0
112 ; CHECK-NEXT: s_mov_b32 s27, s24
113 ; CHECK-NEXT: s_mov_b32 s29, s24
114 ; CHECK-NEXT: v_writelane_b32 v5, s10, 38
115 ; CHECK-NEXT: v_writelane_b32 v1, s67, 33
116 ; CHECK-NEXT: v_writelane_b32 v5, s11, 39
117 ; CHECK-NEXT: s_load_dwordx16 s[52:67], s[26:27], 0x0
118 ; CHECK-NEXT: s_load_dwordx16 s[4:19], s[28:29], 0x0
119 ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
120 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
121 ; CHECK-NEXT: s_xor_b64 s[24:25], vcc, -1
122 ; CHECK-NEXT: ; implicit-def: $vgpr4 : SGPR spill to VGPR lane
123 ; CHECK-NEXT: s_waitcnt vmcnt(0)
124 ; CHECK-NEXT: v_mul_f32_e32 v0, v7, v6
125 ; CHECK-NEXT: s_and_saveexec_b64 s[26:27], s[24:25]
126 ; CHECK-NEXT: s_xor_b64 s[26:27], exec, s[26:27]
127 ; CHECK-NEXT: s_cbranch_execz .LBB0_3
128 ; CHECK-NEXT: ; %bb.1: ; %bb48
129 ; CHECK-NEXT: v_readlane_b32 s36, v5, 0
130 ; CHECK-NEXT: v_readlane_b32 s44, v5, 8
131 ; CHECK-NEXT: v_readlane_b32 s45, v5, 9
132 ; CHECK-NEXT: v_readlane_b32 s46, v5, 10
133 ; CHECK-NEXT: v_readlane_b32 s47, v5, 11
134 ; CHECK-NEXT: v_readlane_b32 s48, v5, 12
135 ; CHECK-NEXT: v_readlane_b32 s49, v5, 13
136 ; CHECK-NEXT: v_readlane_b32 s50, v5, 14
137 ; CHECK-NEXT: v_readlane_b32 s51, v5, 15
138 ; CHECK-NEXT: s_and_b64 vcc, exec, -1
139 ; CHECK-NEXT: v_readlane_b32 s37, v5, 1
140 ; CHECK-NEXT: v_readlane_b32 s38, v5, 2
141 ; CHECK-NEXT: v_readlane_b32 s39, v5, 3
142 ; CHECK-NEXT: v_readlane_b32 s40, v5, 4
143 ; CHECK-NEXT: image_sample_lz v6, v[2:3], s[44:51], s[20:23] dmask:0x1
144 ; CHECK-NEXT: v_mov_b32_e32 v3, 0
145 ; CHECK-NEXT: v_readlane_b32 s41, v5, 5
146 ; CHECK-NEXT: v_readlane_b32 s42, v5, 6
147 ; CHECK-NEXT: v_readlane_b32 s43, v5, 7
148 ; CHECK-NEXT: .LBB0_2: ; %bb50
149 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
150 ; CHECK-NEXT: v_readlane_b32 s36, v5, 32
151 ; CHECK-NEXT: v_readlane_b32 s40, v5, 36
152 ; CHECK-NEXT: v_readlane_b32 s41, v5, 37
153 ; CHECK-NEXT: v_readlane_b32 s42, v5, 38
154 ; CHECK-NEXT: v_readlane_b32 s43, v5, 39
155 ; CHECK-NEXT: s_mov_b32 s21, s20
156 ; CHECK-NEXT: s_mov_b32 s22, s20
157 ; CHECK-NEXT: s_mov_b32 s23, s20
158 ; CHECK-NEXT: v_readlane_b32 s37, v5, 33
159 ; CHECK-NEXT: v_readlane_b32 s38, v5, 34
160 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
161 ; CHECK-NEXT: image_sample_lz v7, v[2:3], s[60:67], s[40:43] dmask:0x1
162 ; CHECK-NEXT: v_readlane_b32 s39, v5, 35
163 ; CHECK-NEXT: image_sample_lz v2, v[2:3], s[12:19], s[20:23] dmask:0x1
164 ; CHECK-NEXT: s_waitcnt vmcnt(0)
165 ; CHECK-NEXT: v_sub_f32_e32 v2, v2, v7
166 ; CHECK-NEXT: v_mul_f32_e32 v2, v2, v0
167 ; CHECK-NEXT: v_mul_f32_e32 v2, v2, v6
168 ; CHECK-NEXT: s_mov_b64 vcc, vcc
169 ; CHECK-NEXT: s_cbranch_vccnz .LBB0_2
170 ; CHECK-NEXT: .LBB0_3: ; %Flow14
171 ; CHECK-NEXT: s_or_saveexec_b64 s[20:21], s[26:27]
172 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
173 ; CHECK-NEXT: v_readlane_b32 s12, v5, 32
174 ; CHECK-NEXT: v_readlane_b32 s13, v5, 33
175 ; CHECK-NEXT: v_readlane_b32 s14, v5, 34
176 ; CHECK-NEXT: v_readlane_b32 s15, v5, 35
177 ; CHECK-NEXT: v_readlane_b32 s16, v5, 36
178 ; CHECK-NEXT: v_readlane_b32 s17, v5, 37
179 ; CHECK-NEXT: v_readlane_b32 s18, v5, 38
180 ; CHECK-NEXT: v_readlane_b32 s19, v5, 39
181 ; CHECK-NEXT: v_writelane_b32 v5, s4, 56
182 ; CHECK-NEXT: v_writelane_b32 v5, s5, 57
183 ; CHECK-NEXT: v_writelane_b32 v5, s6, 58
184 ; CHECK-NEXT: v_writelane_b32 v5, s7, 59
185 ; CHECK-NEXT: v_writelane_b32 v5, s8, 60
186 ; CHECK-NEXT: v_writelane_b32 v5, s9, 61
187 ; CHECK-NEXT: v_writelane_b32 v5, s10, 62
188 ; CHECK-NEXT: v_writelane_b32 v5, s11, 63
189 ; CHECK-NEXT: v_writelane_b32 v5, s52, 40
190 ; CHECK-NEXT: v_writelane_b32 v5, s53, 41
191 ; CHECK-NEXT: v_writelane_b32 v5, s54, 42
192 ; CHECK-NEXT: v_writelane_b32 v5, s55, 43
193 ; CHECK-NEXT: v_writelane_b32 v5, s56, 44
194 ; CHECK-NEXT: v_writelane_b32 v5, s57, 45
195 ; CHECK-NEXT: v_writelane_b32 v5, s58, 46
196 ; CHECK-NEXT: v_writelane_b32 v5, s59, 47
197 ; CHECK-NEXT: v_writelane_b32 v4, s12, 0
198 ; CHECK-NEXT: v_writelane_b32 v5, s60, 48
199 ; CHECK-NEXT: v_writelane_b32 v4, s13, 1
200 ; CHECK-NEXT: v_writelane_b32 v5, s61, 49
201 ; CHECK-NEXT: v_writelane_b32 v4, s14, 2
202 ; CHECK-NEXT: v_writelane_b32 v5, s62, 50
203 ; CHECK-NEXT: v_writelane_b32 v4, s15, 3
204 ; CHECK-NEXT: v_writelane_b32 v5, s63, 51
205 ; CHECK-NEXT: v_writelane_b32 v4, s16, 4
206 ; CHECK-NEXT: v_writelane_b32 v5, s64, 52
207 ; CHECK-NEXT: v_writelane_b32 v4, s17, 5
208 ; CHECK-NEXT: v_writelane_b32 v5, s65, 53
209 ; CHECK-NEXT: v_writelane_b32 v4, s18, 6
210 ; CHECK-NEXT: v_writelane_b32 v5, s66, 54
211 ; CHECK-NEXT: v_writelane_b32 v4, s19, 7
212 ; CHECK-NEXT: v_writelane_b32 v5, s67, 55
213 ; CHECK-NEXT: s_xor_b64 exec, exec, s[20:21]
214 ; CHECK-NEXT: s_cbranch_execz .LBB0_10
215 ; CHECK-NEXT: ; %bb.4: ; %bb32
216 ; CHECK-NEXT: s_and_saveexec_b64 s[8:9], s[24:25]
217 ; CHECK-NEXT: s_xor_b64 s[22:23], exec, s[8:9]
218 ; CHECK-NEXT: s_cbranch_execz .LBB0_6
219 ; CHECK-NEXT: ; %bb.5: ; %bb43
220 ; CHECK-NEXT: s_mov_b32 s8, 0
221 ; CHECK-NEXT: s_mov_b32 s9, s8
222 ; CHECK-NEXT: v_mov_b32_e32 v2, s8
223 ; CHECK-NEXT: v_readlane_b32 s36, v5, 0
224 ; CHECK-NEXT: v_mov_b32_e32 v3, s9
225 ; CHECK-NEXT: s_mov_b32 s10, s8
226 ; CHECK-NEXT: s_mov_b32 s11, s8
227 ; CHECK-NEXT: v_readlane_b32 s37, v5, 1
228 ; CHECK-NEXT: v_readlane_b32 s38, v5, 2
229 ; CHECK-NEXT: v_readlane_b32 s39, v5, 3
230 ; CHECK-NEXT: v_readlane_b32 s40, v5, 4
231 ; CHECK-NEXT: v_readlane_b32 s41, v5, 5
232 ; CHECK-NEXT: v_readlane_b32 s42, v5, 6
233 ; CHECK-NEXT: v_readlane_b32 s43, v5, 7
234 ; CHECK-NEXT: v_readlane_b32 s44, v5, 8
235 ; CHECK-NEXT: v_readlane_b32 s45, v5, 9
236 ; CHECK-NEXT: v_readlane_b32 s46, v5, 10
237 ; CHECK-NEXT: v_readlane_b32 s47, v5, 11
238 ; CHECK-NEXT: v_readlane_b32 s48, v5, 12
239 ; CHECK-NEXT: v_readlane_b32 s49, v5, 13
240 ; CHECK-NEXT: v_readlane_b32 s50, v5, 14
241 ; CHECK-NEXT: v_readlane_b32 s51, v5, 15
242 ; CHECK-NEXT: image_sample_lz v6, v[2:3], s[36:43], s[8:11] dmask:0x1
243 ; CHECK-NEXT: v_readlane_b32 s36, v5, 16
244 ; CHECK-NEXT: v_readlane_b32 s44, v5, 24
245 ; CHECK-NEXT: v_readlane_b32 s45, v5, 25
246 ; CHECK-NEXT: v_readlane_b32 s46, v5, 26
247 ; CHECK-NEXT: v_readlane_b32 s47, v5, 27
248 ; CHECK-NEXT: v_readlane_b32 s48, v5, 28
249 ; CHECK-NEXT: v_readlane_b32 s49, v5, 29
250 ; CHECK-NEXT: v_readlane_b32 s50, v5, 30
251 ; CHECK-NEXT: v_readlane_b32 s51, v5, 31
252 ; CHECK-NEXT: v_mov_b32_e32 v7, 0
253 ; CHECK-NEXT: v_mov_b32_e32 v8, v7
254 ; CHECK-NEXT: v_readlane_b32 s37, v5, 17
255 ; CHECK-NEXT: v_readlane_b32 s38, v5, 18
256 ; CHECK-NEXT: v_readlane_b32 s39, v5, 19
257 ; CHECK-NEXT: image_sample_lz v2, v[2:3], s[44:51], s[12:15] dmask:0x1
258 ; CHECK-NEXT: v_readlane_b32 s40, v5, 20
259 ; CHECK-NEXT: v_readlane_b32 s41, v5, 21
260 ; CHECK-NEXT: v_readlane_b32 s42, v5, 22
261 ; CHECK-NEXT: v_readlane_b32 s43, v5, 23
262 ; CHECK-NEXT: ; implicit-def: $vgpr0
263 ; CHECK-NEXT: s_waitcnt vmcnt(1)
264 ; CHECK-NEXT: buffer_store_dwordx3 v[6:8], off, s[8:11], 0
265 ; CHECK-NEXT: s_waitcnt vmcnt(1)
266 ; CHECK-NEXT: buffer_store_dwordx4 v[2:5], off, s[8:11], 0
267 ; CHECK-NEXT: .LBB0_6: ; %Flow12
268 ; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[22:23]
269 ; CHECK-NEXT: s_cbranch_execz .LBB0_9
270 ; CHECK-NEXT: ; %bb.7: ; %bb33.preheader
271 ; CHECK-NEXT: s_mov_b32 s8, 0
272 ; CHECK-NEXT: s_mov_b32 s6, s8
273 ; CHECK-NEXT: v_readlane_b32 s36, v5, 40
274 ; CHECK-NEXT: s_mov_b32 s7, s8
275 ; CHECK-NEXT: v_mov_b32_e32 v2, s6
276 ; CHECK-NEXT: v_readlane_b32 s37, v5, 41
277 ; CHECK-NEXT: s_mov_b32 s9, s8
278 ; CHECK-NEXT: s_mov_b32 s10, s8
279 ; CHECK-NEXT: s_mov_b32 s11, s8
280 ; CHECK-NEXT: v_mov_b32_e32 v3, s7
281 ; CHECK-NEXT: v_readlane_b32 s38, v5, 42
282 ; CHECK-NEXT: v_readlane_b32 s39, v5, 43
283 ; CHECK-NEXT: v_readlane_b32 s40, v5, 44
284 ; CHECK-NEXT: v_readlane_b32 s41, v5, 45
285 ; CHECK-NEXT: v_readlane_b32 s42, v5, 46
286 ; CHECK-NEXT: v_readlane_b32 s43, v5, 47
287 ; CHECK-NEXT: v_readlane_b32 s44, v5, 48
288 ; CHECK-NEXT: v_readlane_b32 s45, v5, 49
289 ; CHECK-NEXT: v_readlane_b32 s46, v5, 50
290 ; CHECK-NEXT: v_readlane_b32 s47, v5, 51
291 ; CHECK-NEXT: v_readlane_b32 s48, v5, 52
292 ; CHECK-NEXT: v_readlane_b32 s49, v5, 53
293 ; CHECK-NEXT: v_readlane_b32 s50, v5, 54
294 ; CHECK-NEXT: v_readlane_b32 s51, v5, 55
295 ; CHECK-NEXT: s_mov_b64 s[12:13], s[36:37]
296 ; CHECK-NEXT: s_mov_b64 s[14:15], s[38:39]
297 ; CHECK-NEXT: s_mov_b64 s[16:17], s[40:41]
298 ; CHECK-NEXT: s_mov_b64 s[18:19], s[42:43]
299 ; CHECK-NEXT: image_sample_lz v6, v[2:3], s[36:43], s[8:11] dmask:0x1
300 ; CHECK-NEXT: v_readlane_b32 s36, v5, 56
301 ; CHECK-NEXT: v_readlane_b32 s37, v5, 57
302 ; CHECK-NEXT: v_readlane_b32 s38, v5, 58
303 ; CHECK-NEXT: v_readlane_b32 s39, v5, 59
304 ; CHECK-NEXT: v_readlane_b32 s40, v5, 60
305 ; CHECK-NEXT: v_readlane_b32 s41, v5, 61
306 ; CHECK-NEXT: v_readlane_b32 s42, v5, 62
307 ; CHECK-NEXT: v_readlane_b32 s43, v5, 63
308 ; CHECK-NEXT: ; kill: killed $vgpr2_vgpr3
309 ; CHECK-NEXT: s_and_b64 vcc, exec, 0
310 ; CHECK-NEXT: v_readlane_b32 s44, v4, 0
311 ; CHECK-NEXT: v_readlane_b32 s45, v4, 1
312 ; CHECK-NEXT: v_readlane_b32 s46, v4, 2
313 ; CHECK-NEXT: v_readlane_b32 s47, v4, 3
314 ; CHECK-NEXT: image_sample_lz v7, v[2:3], s[36:43], s[8:11] dmask:0x1
315 ; CHECK-NEXT: v_readlane_b32 s48, v4, 4
316 ; CHECK-NEXT: v_readlane_b32 s49, v4, 5
317 ; CHECK-NEXT: v_readlane_b32 s50, v4, 6
318 ; CHECK-NEXT: v_readlane_b32 s51, v4, 7
319 ; CHECK-NEXT: ; kill: killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
320 ; CHECK-NEXT: ; kill: killed $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43
321 ; CHECK-NEXT: ; kill: killed $sgpr8_sgpr9_sgpr10 killed $sgpr11
322 ; CHECK-NEXT: s_waitcnt vmcnt(0)
323 ; CHECK-NEXT: v_sub_f32_e32 v2, v7, v6
324 ; CHECK-NEXT: v_mul_f32_e32 v0, v2, v0
325 ; CHECK-NEXT: v_mov_b32_e32 v2, 0
326 ; CHECK-NEXT: .LBB0_8: ; %bb33
327 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
328 ; CHECK-NEXT: v_add_f32_e32 v3, v2, v0
329 ; CHECK-NEXT: v_sub_f32_e32 v2, v2, v3
330 ; CHECK-NEXT: s_mov_b64 vcc, vcc
331 ; CHECK-NEXT: s_cbranch_vccz .LBB0_8
332 ; CHECK-NEXT: .LBB0_9: ; %Flow13
333 ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
334 ; CHECK-NEXT: .LBB0_10: ; %UnifiedReturnBlock
335 ; CHECK-NEXT: s_or_b64 exec, exec, s[20:21]
336 ; CHECK-NEXT: v_readlane_b32 s67, v1, 33
337 ; CHECK-NEXT: v_readlane_b32 s66, v1, 32
338 ; CHECK-NEXT: v_readlane_b32 s65, v1, 31
339 ; CHECK-NEXT: v_readlane_b32 s64, v1, 30
340 ; CHECK-NEXT: v_readlane_b32 s63, v1, 29
341 ; CHECK-NEXT: v_readlane_b32 s62, v1, 28
342 ; CHECK-NEXT: v_readlane_b32 s61, v1, 27
343 ; CHECK-NEXT: v_readlane_b32 s60, v1, 26
344 ; CHECK-NEXT: v_readlane_b32 s59, v1, 25
345 ; CHECK-NEXT: v_readlane_b32 s58, v1, 24
346 ; CHECK-NEXT: v_readlane_b32 s57, v1, 23
347 ; CHECK-NEXT: v_readlane_b32 s56, v1, 22
348 ; CHECK-NEXT: v_readlane_b32 s55, v1, 21
349 ; CHECK-NEXT: v_readlane_b32 s54, v1, 20
350 ; CHECK-NEXT: v_readlane_b32 s53, v1, 19
351 ; CHECK-NEXT: v_readlane_b32 s52, v1, 18
352 ; CHECK-NEXT: v_readlane_b32 s51, v1, 17
353 ; CHECK-NEXT: v_readlane_b32 s50, v1, 16
354 ; CHECK-NEXT: v_readlane_b32 s49, v1, 15
355 ; CHECK-NEXT: v_readlane_b32 s48, v1, 14
356 ; CHECK-NEXT: v_readlane_b32 s47, v1, 13
357 ; CHECK-NEXT: v_readlane_b32 s46, v1, 12
358 ; CHECK-NEXT: v_readlane_b32 s45, v1, 11
359 ; CHECK-NEXT: v_readlane_b32 s44, v1, 10
360 ; CHECK-NEXT: v_readlane_b32 s43, v1, 9
361 ; CHECK-NEXT: v_readlane_b32 s42, v1, 8
362 ; CHECK-NEXT: v_readlane_b32 s41, v1, 7
363 ; CHECK-NEXT: v_readlane_b32 s40, v1, 6
364 ; CHECK-NEXT: v_readlane_b32 s39, v1, 5
365 ; CHECK-NEXT: v_readlane_b32 s38, v1, 4
366 ; CHECK-NEXT: v_readlane_b32 s37, v1, 3
367 ; CHECK-NEXT: v_readlane_b32 s36, v1, 2
368 ; CHECK-NEXT: v_readlane_b32 s31, v1, 1
369 ; CHECK-NEXT: v_readlane_b32 s30, v1, 0
370 ; CHECK-NEXT: ; kill: killed $vgpr5
371 ; CHECK-NEXT: ; kill: killed $vgpr4
372 ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1
373 ; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
374 ; CHECK-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
375 ; CHECK-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
376 ; CHECK-NEXT: s_mov_b64 exec, s[4:5]
377 ; CHECK-NEXT: s_waitcnt vmcnt(0)
378 ; CHECK-NEXT: s_setpc_b64 s[30:31]
; Build the base pointer: the program counter is truncated to 32 bits, placed in
; element 1 of an otherwise-zero <2 x i32>, and the pair is reinterpreted as a
; 64-bit ptr addrspace(4).
380 %i = call i64 @llvm.amdgcn.s.getpc()
381 %i1 = trunc i64 %i to i32
382 %i2 = insertelement <2 x i32> zeroinitializer, i32 %i1, i64 1
383 %i3 = bitcast <2 x i32> %i2 to i64
384 %i4 = inttoptr i64 %i3 to ptr addrspace(4)
; Load two <4 x i32> values (byte offsets 48 and 64) and eight <8 x i32> values
; (byte offsets 240, 272, 304, 336, 496, 528, 752, 784) from that base. They
; feed the <4 x i32> / <8 x i32> operands of the image-sample calls below.
385 %i5 = getelementptr i8, ptr addrspace(4) %i4, i64 48
386 %i6 = load <4 x i32>, ptr addrspace(4) %i5, align 16
387 %i7 = getelementptr i8, ptr addrspace(4) %i4, i64 64
388 %i8 = load <4 x i32>, ptr addrspace(4) %i7, align 16
389 %i9 = getelementptr i8, ptr addrspace(4) %i4, i64 240
390 %i10 = load <8 x i32>, ptr addrspace(4) %i9, align 32
391 %i11 = getelementptr i8, ptr addrspace(4) %i4, i64 272
392 %i12 = load <8 x i32>, ptr addrspace(4) %i11, align 32
393 %i13 = getelementptr i8, ptr addrspace(4) %i4, i64 304
394 %i14 = load <8 x i32>, ptr addrspace(4) %i13, align 32
395 %i15 = getelementptr i8, ptr addrspace(4) %i4, i64 336
396 %i16 = load <8 x i32>, ptr addrspace(4) %i15, align 32
397 %i17 = getelementptr i8, ptr addrspace(4) %i4, i64 496
398 %i18 = load <8 x i32>, ptr addrspace(4) %i17, align 32
399 %i19 = getelementptr i8, ptr addrspace(4) %i4, i64 528
400 %i20 = load <8 x i32>, ptr addrspace(4) %i19, align 32
401 %i21 = getelementptr i8, ptr addrspace(4) %i4, i64 752
402 %i22 = load <8 x i32>, ptr addrspace(4) %i21, align 32
403 %i23 = getelementptr i8, ptr addrspace(4) %i4, i64 784
404 %i24 = load <8 x i32>, ptr addrspace(4) %i23, align 32
; %i25 is loaded from the null addrspace(4) pointer; only its element 0 is used,
; as the first float operand of the %i27 sample.
405 %i25 = load <4 x float>, ptr addrspace(4) null, align 16
406 %i26 = extractelement <4 x float> %i25, i64 0
407 %i27 = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32(i32 1, float %i26, float 0.000000e+00, <8 x i32> %i12, <4 x i32> zeroinitializer, i1 false, i32 0, i32 0)
408 %i28 = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32 1, float 0.000000e+00, float 0.000000e+00, <8 x i32> %i14, <4 x i32> zeroinitializer, i1 false, i32 0, i32 0)
409 %i29 = extractelement <4 x float> %i28, i64 0
; %i30 is reused as a multiplicand in both %bb33 (%i40) and %bb50 (%i57).
410 %i30 = fmul float %i29, %i27
; %i31 is only consumed in %bb43, where it is written out with
; raw.buffer.store.v4i32.
411 %i31 = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32 1, float 0.000000e+00, float 0.000000e+00, <8 x i32> %i16, <4 x i32> %i6, i1 false, i32 0, i32 0)
; Both conditional branches below test the same %arg.
412 br i1 %arg, label %bb32, label %bb48
; NOTE(review): presumably the body of %bb32 (its label is not visible here;
; %bb33 and %bb43 both list %bb32 as a predecessor) - confirm.
415 br i1 %arg, label %bb33, label %bb43
; %bb33: %i34 is a phi that starts at 0.0 and takes %i42 from %bb33 itself. Each
; pass samples with %i18 and %i22, scales the difference by %i30 and derives
; %i42 = %i34 - (%i34 + %i40).
417 bb33: ; preds = %bb33, %bb32
418 %i34 = phi float [ %i42, %bb33 ], [ 0.000000e+00, %bb32 ]
419 %i35 = call <2 x float> @llvm.amdgcn.image.sample.lz.2d.v2f32.f32(i32 1, float 0.000000e+00, float 0.000000e+00, <8 x i32> %i18, <4 x i32> zeroinitializer, i1 false, i32 0, i32 0)
420 %i36 = extractelement <2 x float> %i35, i64 0
421 %i37 = call <2 x float> @llvm.amdgcn.image.sample.lz.2d.v2f32.f32(i32 1, float 0.000000e+00, float 0.000000e+00, <8 x i32> %i22, <4 x i32> zeroinitializer, i1 false, i32 0, i32 0)
422 %i38 = extractelement <2 x float> %i37, i64 0
423 %i39 = fsub float %i38, %i36
424 %i40 = fmul float %i39, %i30
425 %i41 = fadd float %i34, %i40
426 %i42 = fsub float %i34, %i41
; %bb43: samples with %i10, stores the result as element 0 of a <3 x i32>, then
; stores %i31 as a <4 x i32>; both stores use an all-zero <4 x i32> operand.
429 bb43: ; preds = %bb32
430 %i44 = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32(i32 1, float 0.000000e+00, float 0.000000e+00, <8 x i32> %i10, <4 x i32> zeroinitializer, i1 false, i32 0, i32 0)
431 %i45 = bitcast float %i44 to i32
432 %i46 = insertelement <3 x i32> zeroinitializer, i32 %i45, i64 0
433 call void @llvm.amdgcn.raw.buffer.store.v3i32(<3 x i32> %i46, <4 x i32> zeroinitializer, i32 0, i32 0, i32 0)
434 %i47 = bitcast <4 x float> %i31 to <4 x i32>
435 call void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32> %i47, <4 x i32> zeroinitializer, i32 0, i32 0, i32 0)
; NOTE(review): presumably the body of %bb48 (label not visible here; the %i51
; phi in %bb50 names %bb48 as a predecessor) - confirm. %i49 is the final
; multiplicand of %i58 in %bb50.
439 %i49 = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32(i32 1, float 0.000000e+00, float 0.000000e+00, <8 x i32> %i12, <4 x i32> zeroinitializer, i1 false, i32 0, i32 0)
; %bb50: %i51 is a phi that starts at 0.0 and takes %i58 from %bb50 itself; it
; is the first float operand of both samples (%i20 with %i8, and %i24).
; %i58 = (%i55 - %i53) * %i30 * %i49.
442 bb50: ; preds = %bb50, %bb48
443 %i51 = phi float [ 0.000000e+00, %bb48 ], [ %i58, %bb50 ]
444 %i52 = call <2 x float> @llvm.amdgcn.image.sample.lz.2d.v2f32.f32(i32 1, float %i51, float 0.000000e+00, <8 x i32> %i20, <4 x i32> %i8, i1 false, i32 0, i32 0)
445 %i53 = extractelement <2 x float> %i52, i64 0
446 %i54 = call <2 x float> @llvm.amdgcn.image.sample.lz.2d.v2f32.f32(i32 1, float %i51, float 0.000000e+00, <8 x i32> %i24, <4 x i32> zeroinitializer, i1 false, i32 0, i32 0)
447 %i55 = extractelement <2 x float> %i54, i64 0
448 %i56 = fsub float %i55, %i53
449 %i57 = fmul float %i56, %i30
450 %i58 = fmul float %i57, %i49
; Intrinsics called by @main: s.getpc (attribute group #1), the three
; image.sample.lz.2d variants returning <4 x float>, float and <2 x float>
; (group #2), and the raw.buffer.store variants for <3 x i32> and <4 x i32>
; (group #3).
454 declare i64 @llvm.amdgcn.s.getpc() #1
455 declare <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32 immarg, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #2
456 declare float @llvm.amdgcn.image.sample.lz.2d.f32.f32(i32 immarg, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #2
457 declare <2 x float> @llvm.amdgcn.image.sample.lz.2d.v2f32.f32(i32 immarg, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #2
458 declare void @llvm.amdgcn.raw.buffer.store.v3i32(<3 x i32>, <4 x i32>, i32, i32, i32 immarg) #3
459 declare void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32 immarg) #3
; Attribute groups:
;   #0 - attached to @main; sets "amdgpu-waves-per-eu" to "10,10".
;   #1 - attached to the s.getpc declaration (memory(none), speculatable).
;   #2 - attached to the image.sample.lz.2d declarations (memory(read)).
;   #3 - attached to the raw.buffer.store declarations (memory(write)).
461 attributes #0 = { "amdgpu-waves-per-eu"="10,10" }
462 attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
463 attributes #2 = { nocallback nofree nosync nounwind willreturn memory(read) }
464 attributes #3 = { nocallback nofree nosync nounwind willreturn memory(write) }