1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI %s
3 ; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7 %s
4 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
5 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
6 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s
8 ; RUN: llc -global-isel -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=G_SI %s
9 ; RUN: llc -global-isel -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=G_GFX7 %s
10 ; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=G_VI %s
11 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=G_GFX9 %s
12 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=G_GFX10 %s
; Declarations of the DS floating-point min/max intrinsics used by the kernels
; in this file, in f32 and f64 flavours. Each takes an addrspace(3) pointer and
; a value of the matching floating-point type, followed by two i32 operands and
; one i1 operand, and returns a value of the same floating-point type.
; NOTE(review): the trailing i32, i32, i1 operands are presumably ordering,
; scope and isVolatile -- confirm against the AMDGPU intrinsic definitions.
14 declare float @llvm.amdgcn.ds.fmin.f32(float addrspace(3)* nocapture, float, i32, i32, i1)
15 declare float @llvm.amdgcn.ds.fmax.f32(float addrspace(3)* nocapture, float, i32, i32, i1)
16 declare double @llvm.amdgcn.ds.fmin.f64(double addrspace(3)* nocapture, double, i32, i32, i1)
17 declare double @llvm.amdgcn.ds.fmax.f64(double addrspace(3)* nocapture, double, i32, i32, i1)
; Kernel exercising llvm.amdgcn.ds.fmin.f32 in three shapes: a call whose result
; is consumed (%a1), a call whose result is unused (%a2), and a call on the
; %ptrf argument whose data operand is the result of the first call (%a3).
; The final value is stored through %out, an addrspace(5) pointer.
; The per-target assertions that follow are autogenerated (see the note on the
; first line of the file); regenerate them with the script instead of editing
; them by hand.
20 define amdgpu_kernel void @lds_ds_fmin(float addrspace(5)* %out, float addrspace(3)* %ptrf, i32 %idx) {
21 ; SI-LABEL: lds_ds_fmin:
23 ; SI-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
24 ; SI-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1
25 ; SI-NEXT: s_load_dword s2, s[0:1], 0xb
26 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
27 ; SI-NEXT: s_mov_b32 s6, -1
28 ; SI-NEXT: s_mov_b32 s7, 0xe8f000
29 ; SI-NEXT: s_add_u32 s4, s4, s3
30 ; SI-NEXT: s_addc_u32 s5, s5, 0
31 ; SI-NEXT: s_waitcnt lgkmcnt(0)
32 ; SI-NEXT: s_lshl_b32 s3, s2, 4
33 ; SI-NEXT: s_lshl_b32 s2, s2, 3
34 ; SI-NEXT: s_add_i32 s2, s2, 32
35 ; SI-NEXT: v_mov_b32_e32 v0, 0x42280000
36 ; SI-NEXT: v_mov_b32_e32 v1, s2
37 ; SI-NEXT: s_mov_b32 m0, -1
38 ; SI-NEXT: ds_min_rtn_f32 v1, v1, v0
39 ; SI-NEXT: s_add_i32 s2, s3, 64
40 ; SI-NEXT: v_mov_b32_e32 v2, s2
41 ; SI-NEXT: ds_min_f32 v2, v0
42 ; SI-NEXT: v_mov_b32_e32 v0, s1
43 ; SI-NEXT: s_waitcnt lgkmcnt(1)
44 ; SI-NEXT: ds_min_rtn_f32 v0, v0, v1
45 ; SI-NEXT: v_mov_b32_e32 v1, s0
46 ; SI-NEXT: s_waitcnt lgkmcnt(0)
47 ; SI-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen
50 ; GFX7-LABEL: lds_ds_fmin:
52 ; GFX7-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
53 ; GFX7-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1
54 ; GFX7-NEXT: s_load_dword s2, s[0:1], 0xb
55 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
56 ; GFX7-NEXT: s_mov_b32 s6, -1
57 ; GFX7-NEXT: s_mov_b32 s7, 0xe8f000
58 ; GFX7-NEXT: s_add_u32 s4, s4, s3
59 ; GFX7-NEXT: s_addc_u32 s5, s5, 0
60 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
61 ; GFX7-NEXT: s_lshl_b32 s3, s2, 3
62 ; GFX7-NEXT: v_mov_b32_e32 v0, 0x42280000
63 ; GFX7-NEXT: v_mov_b32_e32 v1, s3
64 ; GFX7-NEXT: s_mov_b32 m0, -1
65 ; GFX7-NEXT: ds_min_rtn_f32 v1, v1, v0 offset:32
66 ; GFX7-NEXT: s_lshl_b32 s2, s2, 4
67 ; GFX7-NEXT: v_mov_b32_e32 v2, s2
68 ; GFX7-NEXT: ds_min_f32 v2, v0 offset:64
69 ; GFX7-NEXT: v_mov_b32_e32 v0, s1
70 ; GFX7-NEXT: s_waitcnt lgkmcnt(1)
71 ; GFX7-NEXT: ds_min_rtn_f32 v0, v0, v1
72 ; GFX7-NEXT: v_mov_b32_e32 v1, s0
73 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
74 ; GFX7-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen
77 ; VI-LABEL: lds_ds_fmin:
79 ; VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
80 ; VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
81 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
82 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
83 ; VI-NEXT: s_mov_b32 s90, -1
84 ; VI-NEXT: s_mov_b32 s91, 0xe80000
85 ; VI-NEXT: s_add_u32 s88, s88, s3
86 ; VI-NEXT: s_addc_u32 s89, s89, 0
87 ; VI-NEXT: s_waitcnt lgkmcnt(0)
88 ; VI-NEXT: s_lshl_b32 s3, s2, 3
89 ; VI-NEXT: v_mov_b32_e32 v0, 0x42280000
90 ; VI-NEXT: v_mov_b32_e32 v1, s3
91 ; VI-NEXT: s_mov_b32 m0, -1
92 ; VI-NEXT: ds_min_rtn_f32 v1, v1, v0 offset:32
93 ; VI-NEXT: s_lshl_b32 s2, s2, 4
94 ; VI-NEXT: v_mov_b32_e32 v2, s2
95 ; VI-NEXT: ds_min_f32 v2, v0 offset:64
96 ; VI-NEXT: v_mov_b32_e32 v0, s1
97 ; VI-NEXT: s_waitcnt lgkmcnt(1)
98 ; VI-NEXT: ds_min_rtn_f32 v0, v0, v1
99 ; VI-NEXT: v_mov_b32_e32 v1, s0
100 ; VI-NEXT: s_waitcnt lgkmcnt(0)
101 ; VI-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen
104 ; GFX9-LABEL: lds_ds_fmin:
106 ; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
107 ; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
108 ; GFX9-NEXT: s_mov_b32 s10, -1
109 ; GFX9-NEXT: s_mov_b32 s11, 0xe00000
110 ; GFX9-NEXT: s_add_u32 s8, s8, s3
111 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
112 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
113 ; GFX9-NEXT: s_addc_u32 s9, s9, 0
114 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42280000
115 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
116 ; GFX9-NEXT: s_lshl_b32 s0, s4, 3
117 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
118 ; GFX9-NEXT: ds_min_rtn_f32 v1, v1, v0 offset:32
119 ; GFX9-NEXT: s_lshl_b32 s0, s4, 4
120 ; GFX9-NEXT: v_mov_b32_e32 v2, s0
121 ; GFX9-NEXT: ds_min_f32 v2, v0 offset:64
122 ; GFX9-NEXT: v_mov_b32_e32 v0, s3
123 ; GFX9-NEXT: s_waitcnt lgkmcnt(1)
124 ; GFX9-NEXT: ds_min_rtn_f32 v0, v0, v1
125 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
126 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
127 ; GFX9-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
128 ; GFX9-NEXT: s_endpgm
130 ; GFX10-LABEL: lds_ds_fmin:
132 ; GFX10-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
133 ; GFX10-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
134 ; GFX10-NEXT: s_mov_b32 s10, -1
135 ; GFX10-NEXT: s_mov_b32 s11, 0x31c16000
136 ; GFX10-NEXT: s_add_u32 s8, s8, s3
137 ; GFX10-NEXT: s_clause 0x1
138 ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
139 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
140 ; GFX10-NEXT: s_addc_u32 s9, s9, 0
141 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x42280000
142 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
143 ; GFX10-NEXT: s_lshl_b32 s0, s4, 3
144 ; GFX10-NEXT: v_mov_b32_e32 v3, s3
145 ; GFX10-NEXT: v_mov_b32_e32 v1, s0
146 ; GFX10-NEXT: s_lshl_b32 s0, s4, 4
147 ; GFX10-NEXT: v_mov_b32_e32 v2, s0
148 ; GFX10-NEXT: ds_min_rtn_f32 v1, v1, v0 offset:32
149 ; GFX10-NEXT: ds_min_f32 v2, v0 offset:64
150 ; GFX10-NEXT: s_waitcnt lgkmcnt(1)
151 ; GFX10-NEXT: ds_min_rtn_f32 v0, v3, v1
152 ; GFX10-NEXT: v_mov_b32_e32 v1, s2
153 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
154 ; GFX10-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
155 ; GFX10-NEXT: s_endpgm
157 ; G_SI-LABEL: lds_ds_fmin:
159 ; G_SI-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
160 ; G_SI-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1
161 ; G_SI-NEXT: s_load_dword s2, s[0:1], 0xb
162 ; G_SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
163 ; G_SI-NEXT: s_mov_b32 s6, -1
164 ; G_SI-NEXT: s_mov_b32 s7, 0xe8f000
165 ; G_SI-NEXT: s_add_u32 s4, s4, s3
166 ; G_SI-NEXT: s_addc_u32 s5, s5, 0
167 ; G_SI-NEXT: s_waitcnt lgkmcnt(0)
168 ; G_SI-NEXT: s_add_i32 s2, s2, 4
169 ; G_SI-NEXT: s_lshl_b32 s3, s2, 3
170 ; G_SI-NEXT: v_mov_b32_e32 v0, 0x42280000
171 ; G_SI-NEXT: v_mov_b32_e32 v1, s3
172 ; G_SI-NEXT: s_mov_b32 m0, -1
173 ; G_SI-NEXT: ds_min_rtn_f32 v1, v1, v0
174 ; G_SI-NEXT: s_lshl_b32 s2, s2, 4
175 ; G_SI-NEXT: v_mov_b32_e32 v2, s2
176 ; G_SI-NEXT: ds_min_f32 v2, v0
177 ; G_SI-NEXT: v_mov_b32_e32 v0, s1
178 ; G_SI-NEXT: s_waitcnt lgkmcnt(1)
179 ; G_SI-NEXT: ds_min_rtn_f32 v0, v0, v1
180 ; G_SI-NEXT: v_mov_b32_e32 v1, s0
181 ; G_SI-NEXT: s_waitcnt lgkmcnt(0)
182 ; G_SI-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen
183 ; G_SI-NEXT: s_endpgm
185 ; G_GFX7-LABEL: lds_ds_fmin:
187 ; G_GFX7-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
188 ; G_GFX7-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1
189 ; G_GFX7-NEXT: s_load_dword s2, s[0:1], 0xb
190 ; G_GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
191 ; G_GFX7-NEXT: s_mov_b32 s6, -1
192 ; G_GFX7-NEXT: s_mov_b32 s7, 0xe8f000
193 ; G_GFX7-NEXT: s_add_u32 s4, s4, s3
194 ; G_GFX7-NEXT: s_addc_u32 s5, s5, 0
195 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0)
196 ; G_GFX7-NEXT: s_add_i32 s2, s2, 4
197 ; G_GFX7-NEXT: s_lshl_b32 s3, s2, 3
198 ; G_GFX7-NEXT: v_mov_b32_e32 v0, 0x42280000
199 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s3
200 ; G_GFX7-NEXT: s_mov_b32 m0, -1
201 ; G_GFX7-NEXT: ds_min_rtn_f32 v1, v1, v0
202 ; G_GFX7-NEXT: s_lshl_b32 s2, s2, 4
203 ; G_GFX7-NEXT: v_mov_b32_e32 v2, s2
204 ; G_GFX7-NEXT: ds_min_f32 v2, v0
205 ; G_GFX7-NEXT: v_mov_b32_e32 v0, s1
206 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(1)
207 ; G_GFX7-NEXT: ds_min_rtn_f32 v0, v0, v1
208 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s0
209 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0)
210 ; G_GFX7-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen
211 ; G_GFX7-NEXT: s_endpgm
213 ; G_VI-LABEL: lds_ds_fmin:
215 ; G_VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
216 ; G_VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
217 ; G_VI-NEXT: s_load_dword s2, s[0:1], 0x2c
218 ; G_VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
219 ; G_VI-NEXT: s_mov_b32 s90, -1
220 ; G_VI-NEXT: s_mov_b32 s91, 0xe80000
221 ; G_VI-NEXT: s_add_u32 s88, s88, s3
222 ; G_VI-NEXT: s_addc_u32 s89, s89, 0
223 ; G_VI-NEXT: s_waitcnt lgkmcnt(0)
224 ; G_VI-NEXT: s_add_i32 s2, s2, 4
225 ; G_VI-NEXT: s_lshl_b32 s3, s2, 3
226 ; G_VI-NEXT: v_mov_b32_e32 v0, 0x42280000
227 ; G_VI-NEXT: v_mov_b32_e32 v1, s3
228 ; G_VI-NEXT: s_mov_b32 m0, -1
229 ; G_VI-NEXT: ds_min_rtn_f32 v1, v1, v0
230 ; G_VI-NEXT: s_lshl_b32 s2, s2, 4
231 ; G_VI-NEXT: v_mov_b32_e32 v2, s2
232 ; G_VI-NEXT: ds_min_f32 v2, v0
233 ; G_VI-NEXT: v_mov_b32_e32 v0, s1
234 ; G_VI-NEXT: s_waitcnt lgkmcnt(1)
235 ; G_VI-NEXT: ds_min_rtn_f32 v0, v0, v1
236 ; G_VI-NEXT: v_mov_b32_e32 v1, s0
237 ; G_VI-NEXT: s_waitcnt lgkmcnt(0)
238 ; G_VI-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen
239 ; G_VI-NEXT: s_endpgm
241 ; G_GFX9-LABEL: lds_ds_fmin:
243 ; G_GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
244 ; G_GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
245 ; G_GFX9-NEXT: s_mov_b32 s10, -1
246 ; G_GFX9-NEXT: s_mov_b32 s11, 0xe00000
247 ; G_GFX9-NEXT: s_add_u32 s8, s8, s3
248 ; G_GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
249 ; G_GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
250 ; G_GFX9-NEXT: s_addc_u32 s9, s9, 0
251 ; G_GFX9-NEXT: v_mov_b32_e32 v1, 0x42280000
252 ; G_GFX9-NEXT: s_waitcnt lgkmcnt(0)
253 ; G_GFX9-NEXT: s_add_i32 s4, s4, 4
254 ; G_GFX9-NEXT: s_lshl_b32 s0, s4, 3
255 ; G_GFX9-NEXT: v_mov_b32_e32 v0, s0
256 ; G_GFX9-NEXT: ds_min_rtn_f32 v0, v0, v1
257 ; G_GFX9-NEXT: s_lshl_b32 s0, s4, 4
258 ; G_GFX9-NEXT: v_mov_b32_e32 v2, s0
259 ; G_GFX9-NEXT: ds_min_f32 v2, v1
260 ; G_GFX9-NEXT: v_mov_b32_e32 v1, s3
261 ; G_GFX9-NEXT: s_waitcnt lgkmcnt(1)
262 ; G_GFX9-NEXT: ds_min_rtn_f32 v0, v1, v0
263 ; G_GFX9-NEXT: v_mov_b32_e32 v1, s2
264 ; G_GFX9-NEXT: s_waitcnt lgkmcnt(0)
265 ; G_GFX9-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
266 ; G_GFX9-NEXT: s_endpgm
268 ; G_GFX10-LABEL: lds_ds_fmin:
270 ; G_GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c
271 ; G_GFX10-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
272 ; G_GFX10-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1
273 ; G_GFX10-NEXT: s_mov_b32 s6, -1
274 ; G_GFX10-NEXT: s_mov_b32 s7, 0x31c16000
275 ; G_GFX10-NEXT: s_add_u32 s4, s4, s3
276 ; G_GFX10-NEXT: s_addc_u32 s5, s5, 0
277 ; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
278 ; G_GFX10-NEXT: v_mov_b32_e32 v1, 0x42280000
279 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
280 ; G_GFX10-NEXT: s_add_i32 s2, s2, 4
281 ; G_GFX10-NEXT: s_lshl_b32 s3, s2, 3
282 ; G_GFX10-NEXT: s_lshl_b32 s2, s2, 4
283 ; G_GFX10-NEXT: v_mov_b32_e32 v0, s3
284 ; G_GFX10-NEXT: v_mov_b32_e32 v2, s2
285 ; G_GFX10-NEXT: v_mov_b32_e32 v3, s1
286 ; G_GFX10-NEXT: ds_min_rtn_f32 v0, v0, v1
287 ; G_GFX10-NEXT: ds_min_f32 v2, v1
288 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(1)
289 ; G_GFX10-NEXT: ds_min_rtn_f32 v0, v3, v0
290 ; G_GFX10-NEXT: v_mov_b32_e32 v1, s0
291 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
292 ; G_GFX10-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen
293 ; G_GFX10-NEXT: s_endpgm
; Both addresses are built from (%idx + 4), shifted left by 3 and by 4, so the
; constant contributes 4 << 3 = 32 and 4 << 4 = 64 bytes. In the expected
; output above, the hawaii/tonga/gfx900/gfx1010 runs without -global-isel carry
; these as instruction offsets (offset 32 and offset 64), the verde run adds 32
; and 64 explicitly after the shifts, and the -global-isel runs add 4 before
; shifting and use no instruction offset.
294 %idx.add = add nuw i32 %idx, 4
295 %shl0 = shl i32 %idx.add, 3
296 %shl1 = shl i32 %idx.add, 4
297 %ptr0 = inttoptr i32 %shl0 to float addrspace(3)*
298 %ptr1 = inttoptr i32 %shl1 to float addrspace(3)*
; 4.2e+1 (42.0) appears in the expected output as the bit pattern 0x42280000.
; %a1 is consumed by %a3 below; the expected output uses ds_min_rtn_f32 for it.
299 %a1 = call float @llvm.amdgcn.ds.fmin.f32(float addrspace(3)* %ptr0, float 4.2e+1, i32 0, i32 0, i1 false)
; %a2 is never used; the expected output uses the non-returning ds_min_f32
; form for this call.
300 %a2 = call float @llvm.amdgcn.ds.fmin.f32(float addrspace(3)* %ptr1, float 4.2e+1, i32 0, i32 0, i1 false)
; Third call operates on the kernel argument pointer, with %a1 as its data
; operand; its result is what gets stored.
301 %a3 = call float @llvm.amdgcn.ds.fmin.f32(float addrspace(3)* %ptrf, float %a1, i32 0, i32 0, i1 false)
; Store through the addrspace(5) pointer; the expected output shows this as a
; buffer_store_dword with offen addressing.
302 store float %a3, float addrspace(5)* %out
; Kernel exercising llvm.amdgcn.ds.fmax.f32 in three shapes: a call whose result
; is consumed (%a1), a call whose result is unused (%a2), and a call on the
; %ptrf argument whose data operand is the result of the first call (%a3).
; The final value is stored through %out, an addrspace(5) pointer.
; Apart from calling the fmax intrinsic, the IR body has the same structure as
; the fmin kernel that precedes it in this file.
; The per-target assertions that follow are autogenerated (see the note on the
; first line of the file); regenerate them with the script instead of editing
; them by hand.
306 define amdgpu_kernel void @lds_ds_fmax(float addrspace(5)* %out, float addrspace(3)* %ptrf, i32 %idx) {
307 ; SI-LABEL: lds_ds_fmax:
309 ; SI-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
310 ; SI-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1
311 ; SI-NEXT: s_load_dword s2, s[0:1], 0xb
312 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
313 ; SI-NEXT: s_mov_b32 s6, -1
314 ; SI-NEXT: s_mov_b32 s7, 0xe8f000
315 ; SI-NEXT: s_add_u32 s4, s4, s3
316 ; SI-NEXT: s_addc_u32 s5, s5, 0
317 ; SI-NEXT: s_waitcnt lgkmcnt(0)
318 ; SI-NEXT: s_lshl_b32 s3, s2, 4
319 ; SI-NEXT: s_lshl_b32 s2, s2, 3
320 ; SI-NEXT: s_add_i32 s2, s2, 32
321 ; SI-NEXT: v_mov_b32_e32 v0, 0x42280000
322 ; SI-NEXT: v_mov_b32_e32 v1, s2
323 ; SI-NEXT: s_mov_b32 m0, -1
324 ; SI-NEXT: ds_max_rtn_f32 v1, v1, v0
325 ; SI-NEXT: s_add_i32 s2, s3, 64
326 ; SI-NEXT: v_mov_b32_e32 v2, s2
327 ; SI-NEXT: ds_max_f32 v2, v0
328 ; SI-NEXT: v_mov_b32_e32 v0, s1
329 ; SI-NEXT: s_waitcnt lgkmcnt(1)
330 ; SI-NEXT: ds_max_rtn_f32 v0, v0, v1
331 ; SI-NEXT: v_mov_b32_e32 v1, s0
332 ; SI-NEXT: s_waitcnt lgkmcnt(0)
333 ; SI-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen
336 ; GFX7-LABEL: lds_ds_fmax:
338 ; GFX7-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
339 ; GFX7-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1
340 ; GFX7-NEXT: s_load_dword s2, s[0:1], 0xb
341 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
342 ; GFX7-NEXT: s_mov_b32 s6, -1
343 ; GFX7-NEXT: s_mov_b32 s7, 0xe8f000
344 ; GFX7-NEXT: s_add_u32 s4, s4, s3
345 ; GFX7-NEXT: s_addc_u32 s5, s5, 0
346 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
347 ; GFX7-NEXT: s_lshl_b32 s3, s2, 3
348 ; GFX7-NEXT: v_mov_b32_e32 v0, 0x42280000
349 ; GFX7-NEXT: v_mov_b32_e32 v1, s3
350 ; GFX7-NEXT: s_mov_b32 m0, -1
351 ; GFX7-NEXT: ds_max_rtn_f32 v1, v1, v0 offset:32
352 ; GFX7-NEXT: s_lshl_b32 s2, s2, 4
353 ; GFX7-NEXT: v_mov_b32_e32 v2, s2
354 ; GFX7-NEXT: ds_max_f32 v2, v0 offset:64
355 ; GFX7-NEXT: v_mov_b32_e32 v0, s1
356 ; GFX7-NEXT: s_waitcnt lgkmcnt(1)
357 ; GFX7-NEXT: ds_max_rtn_f32 v0, v0, v1
358 ; GFX7-NEXT: v_mov_b32_e32 v1, s0
359 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
360 ; GFX7-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen
361 ; GFX7-NEXT: s_endpgm
363 ; VI-LABEL: lds_ds_fmax:
365 ; VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
366 ; VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
367 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
368 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
369 ; VI-NEXT: s_mov_b32 s90, -1
370 ; VI-NEXT: s_mov_b32 s91, 0xe80000
371 ; VI-NEXT: s_add_u32 s88, s88, s3
372 ; VI-NEXT: s_addc_u32 s89, s89, 0
373 ; VI-NEXT: s_waitcnt lgkmcnt(0)
374 ; VI-NEXT: s_lshl_b32 s3, s2, 3
375 ; VI-NEXT: v_mov_b32_e32 v0, 0x42280000
376 ; VI-NEXT: v_mov_b32_e32 v1, s3
377 ; VI-NEXT: s_mov_b32 m0, -1
378 ; VI-NEXT: ds_max_rtn_f32 v1, v1, v0 offset:32
379 ; VI-NEXT: s_lshl_b32 s2, s2, 4
380 ; VI-NEXT: v_mov_b32_e32 v2, s2
381 ; VI-NEXT: ds_max_f32 v2, v0 offset:64
382 ; VI-NEXT: v_mov_b32_e32 v0, s1
383 ; VI-NEXT: s_waitcnt lgkmcnt(1)
384 ; VI-NEXT: ds_max_rtn_f32 v0, v0, v1
385 ; VI-NEXT: v_mov_b32_e32 v1, s0
386 ; VI-NEXT: s_waitcnt lgkmcnt(0)
387 ; VI-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen
390 ; GFX9-LABEL: lds_ds_fmax:
392 ; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
393 ; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
394 ; GFX9-NEXT: s_mov_b32 s10, -1
395 ; GFX9-NEXT: s_mov_b32 s11, 0xe00000
396 ; GFX9-NEXT: s_add_u32 s8, s8, s3
397 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
398 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
399 ; GFX9-NEXT: s_addc_u32 s9, s9, 0
400 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42280000
401 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
402 ; GFX9-NEXT: s_lshl_b32 s0, s4, 3
403 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
404 ; GFX9-NEXT: ds_max_rtn_f32 v1, v1, v0 offset:32
405 ; GFX9-NEXT: s_lshl_b32 s0, s4, 4
406 ; GFX9-NEXT: v_mov_b32_e32 v2, s0
407 ; GFX9-NEXT: ds_max_f32 v2, v0 offset:64
408 ; GFX9-NEXT: v_mov_b32_e32 v0, s3
409 ; GFX9-NEXT: s_waitcnt lgkmcnt(1)
410 ; GFX9-NEXT: ds_max_rtn_f32 v0, v0, v1
411 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
412 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
413 ; GFX9-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
414 ; GFX9-NEXT: s_endpgm
416 ; GFX10-LABEL: lds_ds_fmax:
418 ; GFX10-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
419 ; GFX10-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
420 ; GFX10-NEXT: s_mov_b32 s10, -1
421 ; GFX10-NEXT: s_mov_b32 s11, 0x31c16000
422 ; GFX10-NEXT: s_add_u32 s8, s8, s3
423 ; GFX10-NEXT: s_clause 0x1
424 ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
425 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
426 ; GFX10-NEXT: s_addc_u32 s9, s9, 0
427 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x42280000
428 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
429 ; GFX10-NEXT: s_lshl_b32 s0, s4, 3
430 ; GFX10-NEXT: v_mov_b32_e32 v3, s3
431 ; GFX10-NEXT: v_mov_b32_e32 v1, s0
432 ; GFX10-NEXT: s_lshl_b32 s0, s4, 4
433 ; GFX10-NEXT: v_mov_b32_e32 v2, s0
434 ; GFX10-NEXT: ds_max_rtn_f32 v1, v1, v0 offset:32
435 ; GFX10-NEXT: ds_max_f32 v2, v0 offset:64
436 ; GFX10-NEXT: s_waitcnt lgkmcnt(1)
437 ; GFX10-NEXT: ds_max_rtn_f32 v0, v3, v1
438 ; GFX10-NEXT: v_mov_b32_e32 v1, s2
439 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
440 ; GFX10-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
441 ; GFX10-NEXT: s_endpgm
443 ; G_SI-LABEL: lds_ds_fmax:
445 ; G_SI-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
446 ; G_SI-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1
447 ; G_SI-NEXT: s_load_dword s2, s[0:1], 0xb
448 ; G_SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
449 ; G_SI-NEXT: s_mov_b32 s6, -1
450 ; G_SI-NEXT: s_mov_b32 s7, 0xe8f000
451 ; G_SI-NEXT: s_add_u32 s4, s4, s3
452 ; G_SI-NEXT: s_addc_u32 s5, s5, 0
453 ; G_SI-NEXT: s_waitcnt lgkmcnt(0)
454 ; G_SI-NEXT: s_add_i32 s2, s2, 4
455 ; G_SI-NEXT: s_lshl_b32 s3, s2, 3
456 ; G_SI-NEXT: v_mov_b32_e32 v0, 0x42280000
457 ; G_SI-NEXT: v_mov_b32_e32 v1, s3
458 ; G_SI-NEXT: s_mov_b32 m0, -1
459 ; G_SI-NEXT: ds_max_rtn_f32 v1, v1, v0
460 ; G_SI-NEXT: s_lshl_b32 s2, s2, 4
461 ; G_SI-NEXT: v_mov_b32_e32 v2, s2
462 ; G_SI-NEXT: ds_max_f32 v2, v0
463 ; G_SI-NEXT: v_mov_b32_e32 v0, s1
464 ; G_SI-NEXT: s_waitcnt lgkmcnt(1)
465 ; G_SI-NEXT: ds_max_rtn_f32 v0, v0, v1
466 ; G_SI-NEXT: v_mov_b32_e32 v1, s0
467 ; G_SI-NEXT: s_waitcnt lgkmcnt(0)
468 ; G_SI-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen
469 ; G_SI-NEXT: s_endpgm
471 ; G_GFX7-LABEL: lds_ds_fmax:
473 ; G_GFX7-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
474 ; G_GFX7-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1
475 ; G_GFX7-NEXT: s_load_dword s2, s[0:1], 0xb
476 ; G_GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
477 ; G_GFX7-NEXT: s_mov_b32 s6, -1
478 ; G_GFX7-NEXT: s_mov_b32 s7, 0xe8f000
479 ; G_GFX7-NEXT: s_add_u32 s4, s4, s3
480 ; G_GFX7-NEXT: s_addc_u32 s5, s5, 0
481 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0)
482 ; G_GFX7-NEXT: s_add_i32 s2, s2, 4
483 ; G_GFX7-NEXT: s_lshl_b32 s3, s2, 3
484 ; G_GFX7-NEXT: v_mov_b32_e32 v0, 0x42280000
485 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s3
486 ; G_GFX7-NEXT: s_mov_b32 m0, -1
487 ; G_GFX7-NEXT: ds_max_rtn_f32 v1, v1, v0
488 ; G_GFX7-NEXT: s_lshl_b32 s2, s2, 4
489 ; G_GFX7-NEXT: v_mov_b32_e32 v2, s2
490 ; G_GFX7-NEXT: ds_max_f32 v2, v0
491 ; G_GFX7-NEXT: v_mov_b32_e32 v0, s1
492 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(1)
493 ; G_GFX7-NEXT: ds_max_rtn_f32 v0, v0, v1
494 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s0
495 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0)
496 ; G_GFX7-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen
497 ; G_GFX7-NEXT: s_endpgm
499 ; G_VI-LABEL: lds_ds_fmax:
501 ; G_VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
502 ; G_VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
503 ; G_VI-NEXT: s_load_dword s2, s[0:1], 0x2c
504 ; G_VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
505 ; G_VI-NEXT: s_mov_b32 s90, -1
506 ; G_VI-NEXT: s_mov_b32 s91, 0xe80000
507 ; G_VI-NEXT: s_add_u32 s88, s88, s3
508 ; G_VI-NEXT: s_addc_u32 s89, s89, 0
509 ; G_VI-NEXT: s_waitcnt lgkmcnt(0)
510 ; G_VI-NEXT: s_add_i32 s2, s2, 4
511 ; G_VI-NEXT: s_lshl_b32 s3, s2, 3
512 ; G_VI-NEXT: v_mov_b32_e32 v0, 0x42280000
513 ; G_VI-NEXT: v_mov_b32_e32 v1, s3
514 ; G_VI-NEXT: s_mov_b32 m0, -1
515 ; G_VI-NEXT: ds_max_rtn_f32 v1, v1, v0
516 ; G_VI-NEXT: s_lshl_b32 s2, s2, 4
517 ; G_VI-NEXT: v_mov_b32_e32 v2, s2
518 ; G_VI-NEXT: ds_max_f32 v2, v0
519 ; G_VI-NEXT: v_mov_b32_e32 v0, s1
520 ; G_VI-NEXT: s_waitcnt lgkmcnt(1)
521 ; G_VI-NEXT: ds_max_rtn_f32 v0, v0, v1
522 ; G_VI-NEXT: v_mov_b32_e32 v1, s0
523 ; G_VI-NEXT: s_waitcnt lgkmcnt(0)
524 ; G_VI-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen
525 ; G_VI-NEXT: s_endpgm
527 ; G_GFX9-LABEL: lds_ds_fmax:
529 ; G_GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
530 ; G_GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
531 ; G_GFX9-NEXT: s_mov_b32 s10, -1
532 ; G_GFX9-NEXT: s_mov_b32 s11, 0xe00000
533 ; G_GFX9-NEXT: s_add_u32 s8, s8, s3
534 ; G_GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
535 ; G_GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
536 ; G_GFX9-NEXT: s_addc_u32 s9, s9, 0
537 ; G_GFX9-NEXT: v_mov_b32_e32 v1, 0x42280000
538 ; G_GFX9-NEXT: s_waitcnt lgkmcnt(0)
539 ; G_GFX9-NEXT: s_add_i32 s4, s4, 4
540 ; G_GFX9-NEXT: s_lshl_b32 s0, s4, 3
541 ; G_GFX9-NEXT: v_mov_b32_e32 v0, s0
542 ; G_GFX9-NEXT: ds_max_rtn_f32 v0, v0, v1
543 ; G_GFX9-NEXT: s_lshl_b32 s0, s4, 4
544 ; G_GFX9-NEXT: v_mov_b32_e32 v2, s0
545 ; G_GFX9-NEXT: ds_max_f32 v2, v1
546 ; G_GFX9-NEXT: v_mov_b32_e32 v1, s3
547 ; G_GFX9-NEXT: s_waitcnt lgkmcnt(1)
548 ; G_GFX9-NEXT: ds_max_rtn_f32 v0, v1, v0
549 ; G_GFX9-NEXT: v_mov_b32_e32 v1, s2
550 ; G_GFX9-NEXT: s_waitcnt lgkmcnt(0)
551 ; G_GFX9-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
552 ; G_GFX9-NEXT: s_endpgm
554 ; G_GFX10-LABEL: lds_ds_fmax:
556 ; G_GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c
557 ; G_GFX10-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
558 ; G_GFX10-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1
559 ; G_GFX10-NEXT: s_mov_b32 s6, -1
560 ; G_GFX10-NEXT: s_mov_b32 s7, 0x31c16000
561 ; G_GFX10-NEXT: s_add_u32 s4, s4, s3
562 ; G_GFX10-NEXT: s_addc_u32 s5, s5, 0
563 ; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
564 ; G_GFX10-NEXT: v_mov_b32_e32 v1, 0x42280000
565 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
566 ; G_GFX10-NEXT: s_add_i32 s2, s2, 4
567 ; G_GFX10-NEXT: s_lshl_b32 s3, s2, 3
568 ; G_GFX10-NEXT: s_lshl_b32 s2, s2, 4
569 ; G_GFX10-NEXT: v_mov_b32_e32 v0, s3
570 ; G_GFX10-NEXT: v_mov_b32_e32 v2, s2
571 ; G_GFX10-NEXT: v_mov_b32_e32 v3, s1
572 ; G_GFX10-NEXT: ds_max_rtn_f32 v0, v0, v1
573 ; G_GFX10-NEXT: ds_max_f32 v2, v1
574 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(1)
575 ; G_GFX10-NEXT: ds_max_rtn_f32 v0, v3, v0
576 ; G_GFX10-NEXT: v_mov_b32_e32 v1, s0
577 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
578 ; G_GFX10-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen
579 ; G_GFX10-NEXT: s_endpgm
; Both addresses are built from (%idx + 4), shifted left by 3 and by 4, so the
; constant contributes 4 << 3 = 32 and 4 << 4 = 64 bytes. In the expected
; output above, the hawaii/tonga/gfx900/gfx1010 runs without -global-isel carry
; these as instruction offsets (offset 32 and offset 64), the verde run adds 32
; and 64 explicitly after the shifts, and the -global-isel runs add 4 before
; shifting and use no instruction offset.
580 %idx.add = add nuw i32 %idx, 4
581 %shl0 = shl i32 %idx.add, 3
582 %shl1 = shl i32 %idx.add, 4
583 %ptr0 = inttoptr i32 %shl0 to float addrspace(3)*
584 %ptr1 = inttoptr i32 %shl1 to float addrspace(3)*
; 4.2e+1 (42.0) appears in the expected output as the bit pattern 0x42280000.
; %a1 is consumed by %a3 below; the expected output uses ds_max_rtn_f32 for it.
585 %a1 = call float @llvm.amdgcn.ds.fmax.f32(float addrspace(3)* %ptr0, float 4.2e+1, i32 0, i32 0, i1 false)
; %a2 is never used; the expected output uses the non-returning ds_max_f32
; form for this call.
586 %a2 = call float @llvm.amdgcn.ds.fmax.f32(float addrspace(3)* %ptr1, float 4.2e+1, i32 0, i32 0, i1 false)
; Third call operates on the kernel argument pointer, with %a1 as its data
; operand; its result is what gets stored.
587 %a3 = call float @llvm.amdgcn.ds.fmax.f32(float addrspace(3)* %ptrf, float %a1, i32 0, i32 0, i1 false)
; Store through the addrspace(5) pointer; the expected output shows this as a
; buffer_store_dword with offen addressing.
588 store float %a3, float addrspace(5)* %out
592 define amdgpu_kernel void @lds_ds_fmin_f64(double addrspace(5)* %out, double addrspace(3)* %ptrf, i32 %idx) {
593 ; SI-LABEL: lds_ds_fmin_f64:
595 ; SI-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
596 ; SI-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
597 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
598 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
599 ; SI-NEXT: s_mov_b32 s10, -1
600 ; SI-NEXT: s_mov_b32 s11, 0xe8f000
601 ; SI-NEXT: s_add_u32 s8, s8, s3
602 ; SI-NEXT: s_addc_u32 s9, s9, 0
603 ; SI-NEXT: s_mov_b32 s2, 0
604 ; SI-NEXT: s_waitcnt lgkmcnt(0)
605 ; SI-NEXT: s_lshl_b32 s5, s4, 4
606 ; SI-NEXT: s_lshl_b32 s4, s4, 3
607 ; SI-NEXT: s_mov_b32 s3, 0x40450000
608 ; SI-NEXT: s_add_i32 s4, s4, 32
609 ; SI-NEXT: v_mov_b32_e32 v0, s2
610 ; SI-NEXT: v_mov_b32_e32 v2, s4
611 ; SI-NEXT: v_mov_b32_e32 v1, s3
612 ; SI-NEXT: s_mov_b32 m0, -1
613 ; SI-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1]
614 ; SI-NEXT: s_add_i32 s2, s5, 64
615 ; SI-NEXT: v_mov_b32_e32 v4, s2
616 ; SI-NEXT: ds_min_f64 v4, v[0:1]
617 ; SI-NEXT: v_mov_b32_e32 v0, s1
618 ; SI-NEXT: s_waitcnt lgkmcnt(1)
619 ; SI-NEXT: ds_min_rtn_f64 v[0:1], v0, v[2:3]
620 ; SI-NEXT: s_add_i32 s1, s0, 4
621 ; SI-NEXT: v_mov_b32_e32 v2, s1
622 ; SI-NEXT: s_waitcnt lgkmcnt(0)
623 ; SI-NEXT: buffer_store_dword v1, v2, s[8:11], 0 offen
624 ; SI-NEXT: s_waitcnt expcnt(0)
625 ; SI-NEXT: v_mov_b32_e32 v1, s0
626 ; SI-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
629 ; GFX7-LABEL: lds_ds_fmin_f64:
631 ; GFX7-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
632 ; GFX7-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
633 ; GFX7-NEXT: s_mov_b32 s10, -1
634 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0xb
635 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
636 ; GFX7-NEXT: s_mov_b32 s11, 0xe8f000
637 ; GFX7-NEXT: s_add_u32 s8, s8, s3
638 ; GFX7-NEXT: s_mov_b32 s2, 0
639 ; GFX7-NEXT: s_mov_b32 s3, 0x40450000
640 ; GFX7-NEXT: v_mov_b32_e32 v0, s2
641 ; GFX7-NEXT: s_addc_u32 s9, s9, 0
642 ; GFX7-NEXT: v_mov_b32_e32 v1, s3
643 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
644 ; GFX7-NEXT: s_lshl_b32 s2, s4, 3
645 ; GFX7-NEXT: v_mov_b32_e32 v2, s2
646 ; GFX7-NEXT: s_mov_b32 m0, -1
647 ; GFX7-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] offset:32
648 ; GFX7-NEXT: s_lshl_b32 s2, s4, 4
649 ; GFX7-NEXT: v_mov_b32_e32 v4, s2
650 ; GFX7-NEXT: ds_min_f64 v4, v[0:1] offset:64
651 ; GFX7-NEXT: v_mov_b32_e32 v0, s1
652 ; GFX7-NEXT: s_waitcnt lgkmcnt(1)
653 ; GFX7-NEXT: ds_min_rtn_f64 v[0:1], v0, v[2:3]
654 ; GFX7-NEXT: s_add_i32 s1, s0, 4
655 ; GFX7-NEXT: v_mov_b32_e32 v3, s1
656 ; GFX7-NEXT: v_mov_b32_e32 v2, s0
657 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
658 ; GFX7-NEXT: buffer_store_dword v1, v3, s[8:11], 0 offen
659 ; GFX7-NEXT: buffer_store_dword v0, v2, s[8:11], 0 offen
660 ; GFX7-NEXT: s_endpgm
662 ; VI-LABEL: lds_ds_fmin_f64:
664 ; VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
665 ; VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
666 ; VI-NEXT: s_mov_b32 s90, -1
667 ; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
668 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
669 ; VI-NEXT: s_mov_b32 s91, 0xe80000
670 ; VI-NEXT: s_add_u32 s88, s88, s3
671 ; VI-NEXT: s_mov_b32 s2, 0
672 ; VI-NEXT: s_mov_b32 s3, 0x40450000
673 ; VI-NEXT: v_mov_b32_e32 v0, s2
674 ; VI-NEXT: s_addc_u32 s89, s89, 0
675 ; VI-NEXT: v_mov_b32_e32 v1, s3
676 ; VI-NEXT: s_waitcnt lgkmcnt(0)
677 ; VI-NEXT: s_lshl_b32 s2, s4, 3
678 ; VI-NEXT: v_mov_b32_e32 v2, s2
679 ; VI-NEXT: s_mov_b32 m0, -1
680 ; VI-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] offset:32
681 ; VI-NEXT: s_lshl_b32 s2, s4, 4
682 ; VI-NEXT: v_mov_b32_e32 v4, s2
683 ; VI-NEXT: ds_min_f64 v4, v[0:1] offset:64
684 ; VI-NEXT: v_mov_b32_e32 v0, s1
685 ; VI-NEXT: s_waitcnt lgkmcnt(1)
686 ; VI-NEXT: ds_min_rtn_f64 v[0:1], v0, v[2:3]
687 ; VI-NEXT: s_add_i32 s1, s0, 4
688 ; VI-NEXT: v_mov_b32_e32 v3, s1
689 ; VI-NEXT: v_mov_b32_e32 v2, s0
690 ; VI-NEXT: s_waitcnt lgkmcnt(0)
691 ; VI-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen
692 ; VI-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen
695 ; GFX9-LABEL: lds_ds_fmin_f64:
697 ; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
698 ; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
699 ; GFX9-NEXT: s_mov_b32 s10, -1
700 ; GFX9-NEXT: s_mov_b32 s11, 0xe00000
701 ; GFX9-NEXT: s_add_u32 s8, s8, s3
702 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
703 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
704 ; GFX9-NEXT: s_mov_b32 s0, 0
705 ; GFX9-NEXT: s_mov_b32 s1, 0x40450000
706 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
707 ; GFX9-NEXT: s_addc_u32 s9, s9, 0
708 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
709 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
710 ; GFX9-NEXT: s_lshl_b32 s0, s4, 3
711 ; GFX9-NEXT: v_mov_b32_e32 v2, s0
712 ; GFX9-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] offset:32
713 ; GFX9-NEXT: s_lshl_b32 s0, s4, 4
714 ; GFX9-NEXT: v_mov_b32_e32 v5, s0
715 ; GFX9-NEXT: v_mov_b32_e32 v4, s3
716 ; GFX9-NEXT: ds_min_f64 v5, v[0:1] offset:64
717 ; GFX9-NEXT: s_waitcnt lgkmcnt(1)
718 ; GFX9-NEXT: ds_min_rtn_f64 v[0:1], v4, v[2:3]
719 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
720 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
721 ; GFX9-NEXT: buffer_store_dword v1, v2, s[8:11], 0 offen offset:4
722 ; GFX9-NEXT: buffer_store_dword v0, v2, s[8:11], 0 offen
723 ; GFX9-NEXT: s_endpgm
725 ; GFX10-LABEL: lds_ds_fmin_f64:
727 ; GFX10-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
728 ; GFX10-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
729 ; GFX10-NEXT: s_mov_b32 s10, -1
730 ; GFX10-NEXT: s_mov_b32 s11, 0x31c16000
731 ; GFX10-NEXT: s_add_u32 s8, s8, s3
732 ; GFX10-NEXT: s_clause 0x1
733 ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
734 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
735 ; GFX10-NEXT: s_mov_b32 s0, 0
736 ; GFX10-NEXT: s_addc_u32 s9, s9, 0
737 ; GFX10-NEXT: s_mov_b32 s1, 0x40450000
738 ; GFX10-NEXT: v_mov_b32_e32 v0, s0
739 ; GFX10-NEXT: v_mov_b32_e32 v1, s1
740 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
741 ; GFX10-NEXT: s_lshl_b32 s5, s4, 3
742 ; GFX10-NEXT: s_lshl_b32 s0, s4, 4
743 ; GFX10-NEXT: v_mov_b32_e32 v2, s5
744 ; GFX10-NEXT: v_mov_b32_e32 v4, s0
745 ; GFX10-NEXT: v_mov_b32_e32 v5, s3
746 ; GFX10-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] offset:32
747 ; GFX10-NEXT: ds_min_f64 v4, v[0:1] offset:64
748 ; GFX10-NEXT: s_waitcnt lgkmcnt(1)
749 ; GFX10-NEXT: ds_min_rtn_f64 v[0:1], v5, v[2:3]
750 ; GFX10-NEXT: v_mov_b32_e32 v2, s2
751 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
752 ; GFX10-NEXT: buffer_store_dword v1, v2, s[8:11], 0 offen offset:4
753 ; GFX10-NEXT: buffer_store_dword v0, v2, s[8:11], 0 offen
754 ; GFX10-NEXT: s_endpgm
756 ; G_SI-LABEL: lds_ds_fmin_f64:
758 ; G_SI-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
759 ; G_SI-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
760 ; G_SI-NEXT: s_load_dword s4, s[0:1], 0xb
761 ; G_SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
762 ; G_SI-NEXT: s_mov_b32 s10, -1
763 ; G_SI-NEXT: s_mov_b32 s11, 0xe8f000
764 ; G_SI-NEXT: s_add_u32 s8, s8, s3
765 ; G_SI-NEXT: s_mov_b32 s2, 0
766 ; G_SI-NEXT: s_addc_u32 s9, s9, 0
767 ; G_SI-NEXT: s_mov_b32 s3, 0x40450000
768 ; G_SI-NEXT: v_mov_b32_e32 v0, s2
769 ; G_SI-NEXT: s_waitcnt lgkmcnt(0)
770 ; G_SI-NEXT: s_add_i32 s4, s4, 4
771 ; G_SI-NEXT: v_mov_b32_e32 v1, s3
772 ; G_SI-NEXT: s_lshl_b32 s2, s4, 3
773 ; G_SI-NEXT: v_mov_b32_e32 v2, s2
774 ; G_SI-NEXT: s_mov_b32 m0, -1
775 ; G_SI-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1]
776 ; G_SI-NEXT: s_lshl_b32 s2, s4, 4
777 ; G_SI-NEXT: v_mov_b32_e32 v4, s2
778 ; G_SI-NEXT: ds_min_f64 v4, v[0:1]
779 ; G_SI-NEXT: v_mov_b32_e32 v0, s1
780 ; G_SI-NEXT: s_waitcnt lgkmcnt(1)
781 ; G_SI-NEXT: ds_min_rtn_f64 v[0:1], v0, v[2:3]
782 ; G_SI-NEXT: v_mov_b32_e32 v2, s0
783 ; G_SI-NEXT: s_add_u32 s0, s0, 4
784 ; G_SI-NEXT: v_mov_b32_e32 v3, s0
785 ; G_SI-NEXT: s_waitcnt lgkmcnt(0)
786 ; G_SI-NEXT: buffer_store_dword v0, v2, s[8:11], 0 offen
787 ; G_SI-NEXT: buffer_store_dword v1, v3, s[8:11], 0 offen
788 ; G_SI-NEXT: s_endpgm
790 ; G_GFX7-LABEL: lds_ds_fmin_f64:
792 ; G_GFX7-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
793 ; G_GFX7-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
794 ; G_GFX7-NEXT: s_load_dword s4, s[0:1], 0xb
795 ; G_GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
796 ; G_GFX7-NEXT: s_mov_b32 s10, -1
797 ; G_GFX7-NEXT: s_mov_b32 s11, 0xe8f000
798 ; G_GFX7-NEXT: s_add_u32 s8, s8, s3
799 ; G_GFX7-NEXT: s_mov_b32 s2, 0
800 ; G_GFX7-NEXT: s_addc_u32 s9, s9, 0
801 ; G_GFX7-NEXT: s_mov_b32 s3, 0x40450000
802 ; G_GFX7-NEXT: v_mov_b32_e32 v0, s2
803 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0)
804 ; G_GFX7-NEXT: s_add_i32 s4, s4, 4
805 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s3
806 ; G_GFX7-NEXT: s_lshl_b32 s2, s4, 3
807 ; G_GFX7-NEXT: v_mov_b32_e32 v2, s2
808 ; G_GFX7-NEXT: s_mov_b32 m0, -1
809 ; G_GFX7-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1]
810 ; G_GFX7-NEXT: s_lshl_b32 s2, s4, 4
811 ; G_GFX7-NEXT: v_mov_b32_e32 v4, s2
812 ; G_GFX7-NEXT: ds_min_f64 v4, v[0:1]
813 ; G_GFX7-NEXT: v_mov_b32_e32 v0, s1
814 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(1)
815 ; G_GFX7-NEXT: ds_min_rtn_f64 v[0:1], v0, v[2:3]
816 ; G_GFX7-NEXT: v_mov_b32_e32 v2, s0
817 ; G_GFX7-NEXT: s_add_u32 s0, s0, 4
818 ; G_GFX7-NEXT: v_mov_b32_e32 v3, s0
819 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0)
820 ; G_GFX7-NEXT: buffer_store_dword v0, v2, s[8:11], 0 offen
821 ; G_GFX7-NEXT: buffer_store_dword v1, v3, s[8:11], 0 offen
822 ; G_GFX7-NEXT: s_endpgm
824 ; G_VI-LABEL: lds_ds_fmin_f64:
826 ; G_VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
827 ; G_VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
828 ; G_VI-NEXT: s_load_dword s4, s[0:1], 0x2c
829 ; G_VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
830 ; G_VI-NEXT: s_mov_b32 s90, -1
831 ; G_VI-NEXT: s_mov_b32 s91, 0xe80000
832 ; G_VI-NEXT: s_add_u32 s88, s88, s3
833 ; G_VI-NEXT: s_mov_b32 s2, 0
834 ; G_VI-NEXT: s_addc_u32 s89, s89, 0
835 ; G_VI-NEXT: s_mov_b32 s3, 0x40450000
836 ; G_VI-NEXT: v_mov_b32_e32 v0, s2
837 ; G_VI-NEXT: s_waitcnt lgkmcnt(0)
838 ; G_VI-NEXT: s_add_i32 s4, s4, 4
839 ; G_VI-NEXT: v_mov_b32_e32 v1, s3
840 ; G_VI-NEXT: s_lshl_b32 s2, s4, 3
841 ; G_VI-NEXT: v_mov_b32_e32 v2, s2
842 ; G_VI-NEXT: s_mov_b32 m0, -1
843 ; G_VI-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1]
844 ; G_VI-NEXT: s_lshl_b32 s2, s4, 4
845 ; G_VI-NEXT: v_mov_b32_e32 v4, s2
846 ; G_VI-NEXT: ds_min_f64 v4, v[0:1]
847 ; G_VI-NEXT: v_mov_b32_e32 v0, s1
848 ; G_VI-NEXT: s_waitcnt lgkmcnt(1)
849 ; G_VI-NEXT: ds_min_rtn_f64 v[0:1], v0, v[2:3]
850 ; G_VI-NEXT: v_mov_b32_e32 v2, s0
851 ; G_VI-NEXT: s_add_u32 s0, s0, 4
852 ; G_VI-NEXT: v_mov_b32_e32 v3, s0
853 ; G_VI-NEXT: s_waitcnt lgkmcnt(0)
854 ; G_VI-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen
855 ; G_VI-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen
856 ; G_VI-NEXT: s_endpgm
858 ; G_GFX9-LABEL: lds_ds_fmin_f64:
860 ; G_GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
861 ; G_GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
862 ; G_GFX9-NEXT: s_mov_b32 s10, -1
863 ; G_GFX9-NEXT: s_mov_b32 s11, 0xe00000
864 ; G_GFX9-NEXT: s_add_u32 s8, s8, s3
865 ; G_GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
866 ; G_GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
867 ; G_GFX9-NEXT: s_mov_b32 s0, 0
868 ; G_GFX9-NEXT: s_addc_u32 s9, s9, 0
869 ; G_GFX9-NEXT: s_mov_b32 s1, 0x40450000
870 ; G_GFX9-NEXT: v_mov_b32_e32 v0, s0
871 ; G_GFX9-NEXT: s_waitcnt lgkmcnt(0)
872 ; G_GFX9-NEXT: s_add_i32 s4, s4, 4
873 ; G_GFX9-NEXT: v_mov_b32_e32 v1, s1
874 ; G_GFX9-NEXT: s_lshl_b32 s0, s4, 3
875 ; G_GFX9-NEXT: v_mov_b32_e32 v2, s0
876 ; G_GFX9-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1]
877 ; G_GFX9-NEXT: s_lshl_b32 s0, s4, 4
878 ; G_GFX9-NEXT: v_mov_b32_e32 v5, s0
879 ; G_GFX9-NEXT: v_mov_b32_e32 v4, s3
880 ; G_GFX9-NEXT: ds_min_f64 v5, v[0:1]
881 ; G_GFX9-NEXT: s_waitcnt lgkmcnt(1)
882 ; G_GFX9-NEXT: ds_min_rtn_f64 v[0:1], v4, v[2:3]
883 ; G_GFX9-NEXT: v_mov_b32_e32 v2, s2
884 ; G_GFX9-NEXT: s_waitcnt lgkmcnt(0)
885 ; G_GFX9-NEXT: buffer_store_dword v0, v2, s[8:11], 0 offen
886 ; G_GFX9-NEXT: buffer_store_dword v1, v2, s[8:11], 0 offen offset:4
887 ; G_GFX9-NEXT: s_endpgm
889 ; G_GFX10-LABEL: lds_ds_fmin_f64:
891 ; G_GFX10-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
892 ; G_GFX10-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
893 ; G_GFX10-NEXT: s_mov_b32 s10, -1
894 ; G_GFX10-NEXT: s_mov_b32 s11, 0x31c16000
895 ; G_GFX10-NEXT: s_add_u32 s8, s8, s3
896 ; G_GFX10-NEXT: s_clause 0x1
897 ; G_GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
898 ; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
899 ; G_GFX10-NEXT: s_addc_u32 s9, s9, 0
900 ; G_GFX10-NEXT: s_mov_b32 s0, 0
901 ; G_GFX10-NEXT: s_mov_b32 s1, 0x40450000
902 ; G_GFX10-NEXT: v_mov_b32_e32 v0, s0
903 ; G_GFX10-NEXT: v_mov_b32_e32 v1, s1
904 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
905 ; G_GFX10-NEXT: s_add_i32 s4, s4, 4
906 ; G_GFX10-NEXT: v_mov_b32_e32 v5, s3
907 ; G_GFX10-NEXT: s_lshl_b32 s5, s4, 3
908 ; G_GFX10-NEXT: s_lshl_b32 s0, s4, 4
909 ; G_GFX10-NEXT: v_mov_b32_e32 v2, s5
910 ; G_GFX10-NEXT: v_mov_b32_e32 v4, s0
911 ; G_GFX10-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1]
912 ; G_GFX10-NEXT: ds_min_f64 v4, v[0:1]
913 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(1)
914 ; G_GFX10-NEXT: ds_min_rtn_f64 v[0:1], v5, v[2:3]
915 ; G_GFX10-NEXT: v_mov_b32_e32 v2, s2
916 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
917 ; G_GFX10-NEXT: buffer_store_dword v0, v2, s[8:11], 0 offen
918 ; G_GFX10-NEXT: buffer_store_dword v1, v2, s[8:11], 0 offen offset:4
919 ; G_GFX10-NEXT: s_endpgm
920 %idx.add = add nuw i32 %idx, 4
921 %shl0 = shl i32 %idx.add, 3
922 %shl1 = shl i32 %idx.add, 4
923 %ptr0 = inttoptr i32 %shl0 to double addrspace(3)*
924 %ptr1 = inttoptr i32 %shl1 to double addrspace(3)*
925 %a1 = call double @llvm.amdgcn.ds.fmin.f64(double addrspace(3)* %ptr0, double 4.2e+1, i32 0, i32 0, i1 false)
926 %a2 = call double @llvm.amdgcn.ds.fmin.f64(double addrspace(3)* %ptr1, double 4.2e+1, i32 0, i32 0, i1 false)
927 %a3 = call double @llvm.amdgcn.ds.fmin.f64(double addrspace(3)* %ptrf, double %a1, i32 0, i32 0, i1 false)
928 store double %a3, double addrspace(5)* %out
932 define amdgpu_kernel void @lds_ds_fmax_f64(double addrspace(5)* %out, double addrspace(3)* %ptrf, i32 %idx) {
933 ; SI-LABEL: lds_ds_fmax_f64:
935 ; SI-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
936 ; SI-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
937 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
938 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
939 ; SI-NEXT: s_mov_b32 s10, -1
940 ; SI-NEXT: s_mov_b32 s11, 0xe8f000
941 ; SI-NEXT: s_add_u32 s8, s8, s3
942 ; SI-NEXT: s_addc_u32 s9, s9, 0
943 ; SI-NEXT: s_mov_b32 s2, 0
944 ; SI-NEXT: s_waitcnt lgkmcnt(0)
945 ; SI-NEXT: s_lshl_b32 s5, s4, 4
946 ; SI-NEXT: s_lshl_b32 s4, s4, 3
947 ; SI-NEXT: s_mov_b32 s3, 0x40450000
948 ; SI-NEXT: s_add_i32 s4, s4, 32
949 ; SI-NEXT: v_mov_b32_e32 v0, s2
950 ; SI-NEXT: v_mov_b32_e32 v2, s4
951 ; SI-NEXT: v_mov_b32_e32 v1, s3
952 ; SI-NEXT: s_mov_b32 m0, -1
953 ; SI-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1]
954 ; SI-NEXT: s_add_i32 s2, s5, 64
955 ; SI-NEXT: v_mov_b32_e32 v4, s2
956 ; SI-NEXT: ds_max_f64 v4, v[0:1]
957 ; SI-NEXT: v_mov_b32_e32 v0, s1
958 ; SI-NEXT: s_waitcnt lgkmcnt(1)
959 ; SI-NEXT: ds_max_rtn_f64 v[0:1], v0, v[2:3]
960 ; SI-NEXT: s_add_i32 s1, s0, 4
961 ; SI-NEXT: v_mov_b32_e32 v2, s1
962 ; SI-NEXT: s_waitcnt lgkmcnt(0)
963 ; SI-NEXT: buffer_store_dword v1, v2, s[8:11], 0 offen
964 ; SI-NEXT: s_waitcnt expcnt(0)
965 ; SI-NEXT: v_mov_b32_e32 v1, s0
966 ; SI-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
969 ; GFX7-LABEL: lds_ds_fmax_f64:
971 ; GFX7-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
972 ; GFX7-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
973 ; GFX7-NEXT: s_mov_b32 s10, -1
974 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0xb
975 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
976 ; GFX7-NEXT: s_mov_b32 s11, 0xe8f000
977 ; GFX7-NEXT: s_add_u32 s8, s8, s3
978 ; GFX7-NEXT: s_mov_b32 s2, 0
979 ; GFX7-NEXT: s_mov_b32 s3, 0x40450000
980 ; GFX7-NEXT: v_mov_b32_e32 v0, s2
981 ; GFX7-NEXT: s_addc_u32 s9, s9, 0
982 ; GFX7-NEXT: v_mov_b32_e32 v1, s3
983 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
984 ; GFX7-NEXT: s_lshl_b32 s2, s4, 3
985 ; GFX7-NEXT: v_mov_b32_e32 v2, s2
986 ; GFX7-NEXT: s_mov_b32 m0, -1
987 ; GFX7-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] offset:32
988 ; GFX7-NEXT: s_lshl_b32 s2, s4, 4
989 ; GFX7-NEXT: v_mov_b32_e32 v4, s2
990 ; GFX7-NEXT: ds_max_f64 v4, v[0:1] offset:64
991 ; GFX7-NEXT: v_mov_b32_e32 v0, s1
992 ; GFX7-NEXT: s_waitcnt lgkmcnt(1)
993 ; GFX7-NEXT: ds_max_rtn_f64 v[0:1], v0, v[2:3]
994 ; GFX7-NEXT: s_add_i32 s1, s0, 4
995 ; GFX7-NEXT: v_mov_b32_e32 v3, s1
996 ; GFX7-NEXT: v_mov_b32_e32 v2, s0
997 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
998 ; GFX7-NEXT: buffer_store_dword v1, v3, s[8:11], 0 offen
999 ; GFX7-NEXT: buffer_store_dword v0, v2, s[8:11], 0 offen
1000 ; GFX7-NEXT: s_endpgm
1002 ; VI-LABEL: lds_ds_fmax_f64:
1004 ; VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
1005 ; VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
1006 ; VI-NEXT: s_mov_b32 s90, -1
1007 ; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
1008 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1009 ; VI-NEXT: s_mov_b32 s91, 0xe80000
1010 ; VI-NEXT: s_add_u32 s88, s88, s3
1011 ; VI-NEXT: s_mov_b32 s2, 0
1012 ; VI-NEXT: s_mov_b32 s3, 0x40450000
1013 ; VI-NEXT: v_mov_b32_e32 v0, s2
1014 ; VI-NEXT: s_addc_u32 s89, s89, 0
1015 ; VI-NEXT: v_mov_b32_e32 v1, s3
1016 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1017 ; VI-NEXT: s_lshl_b32 s2, s4, 3
1018 ; VI-NEXT: v_mov_b32_e32 v2, s2
1019 ; VI-NEXT: s_mov_b32 m0, -1
1020 ; VI-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] offset:32
1021 ; VI-NEXT: s_lshl_b32 s2, s4, 4
1022 ; VI-NEXT: v_mov_b32_e32 v4, s2
1023 ; VI-NEXT: ds_max_f64 v4, v[0:1] offset:64
1024 ; VI-NEXT: v_mov_b32_e32 v0, s1
1025 ; VI-NEXT: s_waitcnt lgkmcnt(1)
1026 ; VI-NEXT: ds_max_rtn_f64 v[0:1], v0, v[2:3]
1027 ; VI-NEXT: s_add_i32 s1, s0, 4
1028 ; VI-NEXT: v_mov_b32_e32 v3, s1
1029 ; VI-NEXT: v_mov_b32_e32 v2, s0
1030 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1031 ; VI-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen
1032 ; VI-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen
1035 ; GFX9-LABEL: lds_ds_fmax_f64:
1037 ; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1038 ; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1039 ; GFX9-NEXT: s_mov_b32 s10, -1
1040 ; GFX9-NEXT: s_mov_b32 s11, 0xe00000
1041 ; GFX9-NEXT: s_add_u32 s8, s8, s3
1042 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
1043 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
1044 ; GFX9-NEXT: s_mov_b32 s0, 0
1045 ; GFX9-NEXT: s_mov_b32 s1, 0x40450000
1046 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
1047 ; GFX9-NEXT: s_addc_u32 s9, s9, 0
1048 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1049 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1050 ; GFX9-NEXT: s_lshl_b32 s0, s4, 3
1051 ; GFX9-NEXT: v_mov_b32_e32 v2, s0
1052 ; GFX9-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] offset:32
1053 ; GFX9-NEXT: s_lshl_b32 s0, s4, 4
1054 ; GFX9-NEXT: v_mov_b32_e32 v5, s0
1055 ; GFX9-NEXT: v_mov_b32_e32 v4, s3
1056 ; GFX9-NEXT: ds_max_f64 v5, v[0:1] offset:64
1057 ; GFX9-NEXT: s_waitcnt lgkmcnt(1)
1058 ; GFX9-NEXT: ds_max_rtn_f64 v[0:1], v4, v[2:3]
1059 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
1060 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1061 ; GFX9-NEXT: buffer_store_dword v1, v2, s[8:11], 0 offen offset:4
1062 ; GFX9-NEXT: buffer_store_dword v0, v2, s[8:11], 0 offen
1063 ; GFX9-NEXT: s_endpgm
1065 ; GFX10-LABEL: lds_ds_fmax_f64:
1067 ; GFX10-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1068 ; GFX10-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1069 ; GFX10-NEXT: s_mov_b32 s10, -1
1070 ; GFX10-NEXT: s_mov_b32 s11, 0x31c16000
1071 ; GFX10-NEXT: s_add_u32 s8, s8, s3
1072 ; GFX10-NEXT: s_clause 0x1
1073 ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
1074 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
1075 ; GFX10-NEXT: s_mov_b32 s0, 0
1076 ; GFX10-NEXT: s_addc_u32 s9, s9, 0
1077 ; GFX10-NEXT: s_mov_b32 s1, 0x40450000
1078 ; GFX10-NEXT: v_mov_b32_e32 v0, s0
1079 ; GFX10-NEXT: v_mov_b32_e32 v1, s1
1080 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1081 ; GFX10-NEXT: s_lshl_b32 s5, s4, 3
1082 ; GFX10-NEXT: s_lshl_b32 s0, s4, 4
1083 ; GFX10-NEXT: v_mov_b32_e32 v2, s5
1084 ; GFX10-NEXT: v_mov_b32_e32 v4, s0
1085 ; GFX10-NEXT: v_mov_b32_e32 v5, s3
1086 ; GFX10-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] offset:32
1087 ; GFX10-NEXT: ds_max_f64 v4, v[0:1] offset:64
1088 ; GFX10-NEXT: s_waitcnt lgkmcnt(1)
1089 ; GFX10-NEXT: ds_max_rtn_f64 v[0:1], v5, v[2:3]
1090 ; GFX10-NEXT: v_mov_b32_e32 v2, s2
1091 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1092 ; GFX10-NEXT: buffer_store_dword v1, v2, s[8:11], 0 offen offset:4
1093 ; GFX10-NEXT: buffer_store_dword v0, v2, s[8:11], 0 offen
1094 ; GFX10-NEXT: s_endpgm
1096 ; G_SI-LABEL: lds_ds_fmax_f64:
1098 ; G_SI-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1099 ; G_SI-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1100 ; G_SI-NEXT: s_load_dword s4, s[0:1], 0xb
1101 ; G_SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1102 ; G_SI-NEXT: s_mov_b32 s10, -1
1103 ; G_SI-NEXT: s_mov_b32 s11, 0xe8f000
1104 ; G_SI-NEXT: s_add_u32 s8, s8, s3
1105 ; G_SI-NEXT: s_mov_b32 s2, 0
1106 ; G_SI-NEXT: s_addc_u32 s9, s9, 0
1107 ; G_SI-NEXT: s_mov_b32 s3, 0x40450000
1108 ; G_SI-NEXT: v_mov_b32_e32 v0, s2
1109 ; G_SI-NEXT: s_waitcnt lgkmcnt(0)
1110 ; G_SI-NEXT: s_add_i32 s4, s4, 4
1111 ; G_SI-NEXT: v_mov_b32_e32 v1, s3
1112 ; G_SI-NEXT: s_lshl_b32 s2, s4, 3
1113 ; G_SI-NEXT: v_mov_b32_e32 v2, s2
1114 ; G_SI-NEXT: s_mov_b32 m0, -1
1115 ; G_SI-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1]
1116 ; G_SI-NEXT: s_lshl_b32 s2, s4, 4
1117 ; G_SI-NEXT: v_mov_b32_e32 v4, s2
1118 ; G_SI-NEXT: ds_max_f64 v4, v[0:1]
1119 ; G_SI-NEXT: v_mov_b32_e32 v0, s1
1120 ; G_SI-NEXT: s_waitcnt lgkmcnt(1)
1121 ; G_SI-NEXT: ds_max_rtn_f64 v[0:1], v0, v[2:3]
1122 ; G_SI-NEXT: v_mov_b32_e32 v2, s0
1123 ; G_SI-NEXT: s_add_u32 s0, s0, 4
1124 ; G_SI-NEXT: v_mov_b32_e32 v3, s0
1125 ; G_SI-NEXT: s_waitcnt lgkmcnt(0)
1126 ; G_SI-NEXT: buffer_store_dword v0, v2, s[8:11], 0 offen
1127 ; G_SI-NEXT: buffer_store_dword v1, v3, s[8:11], 0 offen
1128 ; G_SI-NEXT: s_endpgm
1130 ; G_GFX7-LABEL: lds_ds_fmax_f64:
1132 ; G_GFX7-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1133 ; G_GFX7-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1134 ; G_GFX7-NEXT: s_load_dword s4, s[0:1], 0xb
1135 ; G_GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1136 ; G_GFX7-NEXT: s_mov_b32 s10, -1
1137 ; G_GFX7-NEXT: s_mov_b32 s11, 0xe8f000
1138 ; G_GFX7-NEXT: s_add_u32 s8, s8, s3
1139 ; G_GFX7-NEXT: s_mov_b32 s2, 0
1140 ; G_GFX7-NEXT: s_addc_u32 s9, s9, 0
1141 ; G_GFX7-NEXT: s_mov_b32 s3, 0x40450000
1142 ; G_GFX7-NEXT: v_mov_b32_e32 v0, s2
1143 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0)
1144 ; G_GFX7-NEXT: s_add_i32 s4, s4, 4
1145 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s3
1146 ; G_GFX7-NEXT: s_lshl_b32 s2, s4, 3
1147 ; G_GFX7-NEXT: v_mov_b32_e32 v2, s2
1148 ; G_GFX7-NEXT: s_mov_b32 m0, -1
1149 ; G_GFX7-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1]
1150 ; G_GFX7-NEXT: s_lshl_b32 s2, s4, 4
1151 ; G_GFX7-NEXT: v_mov_b32_e32 v4, s2
1152 ; G_GFX7-NEXT: ds_max_f64 v4, v[0:1]
1153 ; G_GFX7-NEXT: v_mov_b32_e32 v0, s1
1154 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(1)
1155 ; G_GFX7-NEXT: ds_max_rtn_f64 v[0:1], v0, v[2:3]
1156 ; G_GFX7-NEXT: v_mov_b32_e32 v2, s0
1157 ; G_GFX7-NEXT: s_add_u32 s0, s0, 4
1158 ; G_GFX7-NEXT: v_mov_b32_e32 v3, s0
1159 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0)
1160 ; G_GFX7-NEXT: buffer_store_dword v0, v2, s[8:11], 0 offen
1161 ; G_GFX7-NEXT: buffer_store_dword v1, v3, s[8:11], 0 offen
1162 ; G_GFX7-NEXT: s_endpgm
1164 ; G_VI-LABEL: lds_ds_fmax_f64:
1166 ; G_VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
1167 ; G_VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
1168 ; G_VI-NEXT: s_load_dword s4, s[0:1], 0x2c
1169 ; G_VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1170 ; G_VI-NEXT: s_mov_b32 s90, -1
1171 ; G_VI-NEXT: s_mov_b32 s91, 0xe80000
1172 ; G_VI-NEXT: s_add_u32 s88, s88, s3
1173 ; G_VI-NEXT: s_mov_b32 s2, 0
1174 ; G_VI-NEXT: s_addc_u32 s89, s89, 0
1175 ; G_VI-NEXT: s_mov_b32 s3, 0x40450000
1176 ; G_VI-NEXT: v_mov_b32_e32 v0, s2
1177 ; G_VI-NEXT: s_waitcnt lgkmcnt(0)
1178 ; G_VI-NEXT: s_add_i32 s4, s4, 4
1179 ; G_VI-NEXT: v_mov_b32_e32 v1, s3
1180 ; G_VI-NEXT: s_lshl_b32 s2, s4, 3
1181 ; G_VI-NEXT: v_mov_b32_e32 v2, s2
1182 ; G_VI-NEXT: s_mov_b32 m0, -1
1183 ; G_VI-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1]
1184 ; G_VI-NEXT: s_lshl_b32 s2, s4, 4
1185 ; G_VI-NEXT: v_mov_b32_e32 v4, s2
1186 ; G_VI-NEXT: ds_max_f64 v4, v[0:1]
1187 ; G_VI-NEXT: v_mov_b32_e32 v0, s1
1188 ; G_VI-NEXT: s_waitcnt lgkmcnt(1)
1189 ; G_VI-NEXT: ds_max_rtn_f64 v[0:1], v0, v[2:3]
1190 ; G_VI-NEXT: v_mov_b32_e32 v2, s0
1191 ; G_VI-NEXT: s_add_u32 s0, s0, 4
1192 ; G_VI-NEXT: v_mov_b32_e32 v3, s0
1193 ; G_VI-NEXT: s_waitcnt lgkmcnt(0)
1194 ; G_VI-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen
1195 ; G_VI-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen
1196 ; G_VI-NEXT: s_endpgm
1198 ; G_GFX9-LABEL: lds_ds_fmax_f64:
1200 ; G_GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1201 ; G_GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1202 ; G_GFX9-NEXT: s_mov_b32 s10, -1
1203 ; G_GFX9-NEXT: s_mov_b32 s11, 0xe00000
1204 ; G_GFX9-NEXT: s_add_u32 s8, s8, s3
1205 ; G_GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
1206 ; G_GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
1207 ; G_GFX9-NEXT: s_mov_b32 s0, 0
1208 ; G_GFX9-NEXT: s_addc_u32 s9, s9, 0
1209 ; G_GFX9-NEXT: s_mov_b32 s1, 0x40450000
1210 ; G_GFX9-NEXT: v_mov_b32_e32 v0, s0
1211 ; G_GFX9-NEXT: s_waitcnt lgkmcnt(0)
1212 ; G_GFX9-NEXT: s_add_i32 s4, s4, 4
1213 ; G_GFX9-NEXT: v_mov_b32_e32 v1, s1
1214 ; G_GFX9-NEXT: s_lshl_b32 s0, s4, 3
1215 ; G_GFX9-NEXT: v_mov_b32_e32 v2, s0
1216 ; G_GFX9-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1]
1217 ; G_GFX9-NEXT: s_lshl_b32 s0, s4, 4
1218 ; G_GFX9-NEXT: v_mov_b32_e32 v5, s0
1219 ; G_GFX9-NEXT: v_mov_b32_e32 v4, s3
1220 ; G_GFX9-NEXT: ds_max_f64 v5, v[0:1]
1221 ; G_GFX9-NEXT: s_waitcnt lgkmcnt(1)
1222 ; G_GFX9-NEXT: ds_max_rtn_f64 v[0:1], v4, v[2:3]
1223 ; G_GFX9-NEXT: v_mov_b32_e32 v2, s2
1224 ; G_GFX9-NEXT: s_waitcnt lgkmcnt(0)
1225 ; G_GFX9-NEXT: buffer_store_dword v0, v2, s[8:11], 0 offen
1226 ; G_GFX9-NEXT: buffer_store_dword v1, v2, s[8:11], 0 offen offset:4
1227 ; G_GFX9-NEXT: s_endpgm
1229 ; G_GFX10-LABEL: lds_ds_fmax_f64:
1231 ; G_GFX10-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1232 ; G_GFX10-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1233 ; G_GFX10-NEXT: s_mov_b32 s10, -1
1234 ; G_GFX10-NEXT: s_mov_b32 s11, 0x31c16000
1235 ; G_GFX10-NEXT: s_add_u32 s8, s8, s3
1236 ; G_GFX10-NEXT: s_clause 0x1
1237 ; G_GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
1238 ; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
1239 ; G_GFX10-NEXT: s_addc_u32 s9, s9, 0
1240 ; G_GFX10-NEXT: s_mov_b32 s0, 0
1241 ; G_GFX10-NEXT: s_mov_b32 s1, 0x40450000
1242 ; G_GFX10-NEXT: v_mov_b32_e32 v0, s0
1243 ; G_GFX10-NEXT: v_mov_b32_e32 v1, s1
1244 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
1245 ; G_GFX10-NEXT: s_add_i32 s4, s4, 4
1246 ; G_GFX10-NEXT: v_mov_b32_e32 v5, s3
1247 ; G_GFX10-NEXT: s_lshl_b32 s5, s4, 3
1248 ; G_GFX10-NEXT: s_lshl_b32 s0, s4, 4
1249 ; G_GFX10-NEXT: v_mov_b32_e32 v2, s5
1250 ; G_GFX10-NEXT: v_mov_b32_e32 v4, s0
1251 ; G_GFX10-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1]
1252 ; G_GFX10-NEXT: ds_max_f64 v4, v[0:1]
1253 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(1)
1254 ; G_GFX10-NEXT: ds_max_rtn_f64 v[0:1], v5, v[2:3]
1255 ; G_GFX10-NEXT: v_mov_b32_e32 v2, s2
1256 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
1257 ; G_GFX10-NEXT: buffer_store_dword v0, v2, s[8:11], 0 offen
1258 ; G_GFX10-NEXT: buffer_store_dword v1, v2, s[8:11], 0 offen offset:4
1259 ; G_GFX10-NEXT: s_endpgm
1260 %idx.add = add nuw i32 %idx, 4
1261 %shl0 = shl i32 %idx.add, 3
1262 %shl1 = shl i32 %idx.add, 4
1263 %ptr0 = inttoptr i32 %shl0 to double addrspace(3)*
1264 %ptr1 = inttoptr i32 %shl1 to double addrspace(3)*
1265 %a1 = call double @llvm.amdgcn.ds.fmax.f64(double addrspace(3)* %ptr0, double 4.2e+1, i32 0, i32 0, i1 false)
1266 %a2 = call double @llvm.amdgcn.ds.fmax.f64(double addrspace(3)* %ptr1, double 4.2e+1, i32 0, i32 0, i1 false)
1267 %a3 = call double @llvm.amdgcn.ds.fmax.f64(double addrspace(3)* %ptrf, double %a1, i32 0, i32 0, i1 false)
1268 store double %a3, double addrspace(5)* %out