1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2 ; RUN: llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6 %s
3 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
4 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
6 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
7 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
; Scalar i32 add of two uniform values.
; The kernel reads two adjacent i32 values (%in[0] and %in[1]) with plain,
; non-volatile loads, adds them and stores the sum to %out. The assertions
; below expect both inputs to arrive through one 64-bit scalar load and the
; add itself to be a scalar ALU add (s_add_i32; s_add_co_i32 in the gfx1200
; run), with the result copied to a VGPR only for the final store.
9 define amdgpu_kernel void @s_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
10 ; GFX6-LABEL: s_add_i32:
12 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
13 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
14 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
15 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
16 ; GFX6-NEXT: s_mov_b32 s2, -1
17 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
18 ; GFX6-NEXT: s_add_i32 s4, s4, s5
19 ; GFX6-NEXT: v_mov_b32_e32 v0, s4
20 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
23 ; GFX8-LABEL: s_add_i32:
25 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
26 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
27 ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
28 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
29 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
30 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
31 ; GFX8-NEXT: s_add_i32 s0, s2, s3
32 ; GFX8-NEXT: v_mov_b32_e32 v2, s0
33 ; GFX8-NEXT: flat_store_dword v[0:1], v2
36 ; GFX9-LABEL: s_add_i32:
38 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
39 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
40 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
41 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
43 ; GFX9-NEXT: s_add_i32 s2, s4, s5
44 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
45 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
48 ; GFX10-LABEL: s_add_i32:
50 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
51 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
52 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
53 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
54 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
55 ; GFX10-NEXT: s_add_i32 s2, s4, s5
56 ; GFX10-NEXT: v_mov_b32_e32 v1, s2
57 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
58 ; GFX10-NEXT: s_endpgm
60 ; GFX11-LABEL: s_add_i32:
62 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
63 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
64 ; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
65 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
66 ; GFX11-NEXT: s_add_i32 s2, s2, s3
67 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
68 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
69 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
70 ; GFX11-NEXT: s_endpgm
72 ; GFX12-LABEL: s_add_i32:
74 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
75 ; GFX12-NEXT: s_wait_kmcnt 0x0
76 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
77 ; GFX12-NEXT: s_wait_kmcnt 0x0
78 ; GFX12-NEXT: s_add_co_i32 s2, s2, s3
79 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
80 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
81 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
82 ; GFX12-NEXT: s_endpgm
; IR under test: %b_ptr addresses the i32 immediately after %in, so the two
; loads cover consecutive elements.
83 %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
84 %a = load i32, ptr addrspace(1) %in
85 %b = load i32, ptr addrspace(1) %b_ptr
86 %result = add i32 %a, %b
87 store i32 %result, ptr addrspace(1) %out
; Scalar <2 x i32> add of two uniform vectors.
; The kernel loads two consecutive <2 x i32> values starting at %in, adds
; them element-wise and stores the result to %out. The assertions below
; expect a single 128-bit scalar load covering both vectors, one scalar add
; per lane (lane 0: s4+s6, lane 1: s5+s7) and one 64-bit store.
91 define amdgpu_kernel void @s_add_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
92 ; GFX6-LABEL: s_add_v2i32:
94 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
95 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
96 ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
97 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
98 ; GFX6-NEXT: s_mov_b32 s2, -1
99 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
100 ; GFX6-NEXT: s_add_i32 s5, s5, s7
101 ; GFX6-NEXT: s_add_i32 s4, s4, s6
102 ; GFX6-NEXT: v_mov_b32_e32 v0, s4
103 ; GFX6-NEXT: v_mov_b32_e32 v1, s5
104 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
105 ; GFX6-NEXT: s_endpgm
107 ; GFX8-LABEL: s_add_v2i32:
109 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
110 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
111 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
112 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
113 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
114 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
115 ; GFX8-NEXT: s_add_i32 s0, s5, s7
116 ; GFX8-NEXT: s_add_i32 s1, s4, s6
117 ; GFX8-NEXT: v_mov_b32_e32 v2, s1
118 ; GFX8-NEXT: v_mov_b32_e32 v3, s0
119 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
120 ; GFX8-NEXT: s_endpgm
122 ; GFX9-LABEL: s_add_v2i32:
124 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
125 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
126 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
127 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
128 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
129 ; GFX9-NEXT: s_add_i32 s2, s5, s7
130 ; GFX9-NEXT: s_add_i32 s3, s4, s6
131 ; GFX9-NEXT: v_mov_b32_e32 v0, s3
132 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
133 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
134 ; GFX9-NEXT: s_endpgm
136 ; GFX10-LABEL: s_add_v2i32:
138 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
139 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
140 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
141 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
142 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
143 ; GFX10-NEXT: s_add_i32 s2, s4, s6
144 ; GFX10-NEXT: s_add_i32 s3, s5, s7
145 ; GFX10-NEXT: v_mov_b32_e32 v0, s2
146 ; GFX10-NEXT: v_mov_b32_e32 v1, s3
147 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
148 ; GFX10-NEXT: s_endpgm
150 ; GFX11-LABEL: s_add_v2i32:
152 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
153 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
154 ; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
155 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
156 ; GFX11-NEXT: s_add_i32 s2, s4, s6
157 ; GFX11-NEXT: s_add_i32 s3, s5, s7
158 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
159 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
160 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
161 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
162 ; GFX11-NEXT: s_endpgm
164 ; GFX12-LABEL: s_add_v2i32:
166 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
167 ; GFX12-NEXT: s_wait_kmcnt 0x0
168 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
169 ; GFX12-NEXT: s_wait_kmcnt 0x0
170 ; GFX12-NEXT: s_add_co_i32 s2, s4, s6
171 ; GFX12-NEXT: s_add_co_i32 s3, s5, s7
172 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
173 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
174 ; GFX12-NEXT: v_mov_b32_e32 v0, s2
175 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
176 ; GFX12-NEXT: s_endpgm
; IR under test: the getelementptr steps by one whole <2 x i32>, so %b is the
; vector stored directly after %a.
177 %b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1
178 %a = load <2 x i32>, ptr addrspace(1) %in
179 %b = load <2 x i32>, ptr addrspace(1) %b_ptr
180 %result = add <2 x i32> %a, %b
181 store <2 x i32> %result, ptr addrspace(1) %out
; Scalar <4 x i32> add of two uniform vectors.
; The kernel loads two consecutive <4 x i32> values starting at %in, adds
; them element-wise and stores the result to %out. The assertions below
; expect a single 256-bit scalar load into s[0:7], four scalar adds pairing
; s[0:3] with s[4:7] lane by lane, and one 128-bit store of the result.
185 define amdgpu_kernel void @s_add_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
186 ; GFX6-LABEL: s_add_v4i32:
188 ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
189 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
190 ; GFX6-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
191 ; GFX6-NEXT: s_mov_b32 s11, 0xf000
192 ; GFX6-NEXT: s_mov_b32 s10, -1
193 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
194 ; GFX6-NEXT: s_add_i32 s3, s3, s7
195 ; GFX6-NEXT: s_add_i32 s2, s2, s6
196 ; GFX6-NEXT: s_add_i32 s1, s1, s5
197 ; GFX6-NEXT: s_add_i32 s0, s0, s4
198 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
199 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
200 ; GFX6-NEXT: v_mov_b32_e32 v2, s2
201 ; GFX6-NEXT: v_mov_b32_e32 v3, s3
202 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
203 ; GFX6-NEXT: s_endpgm
205 ; GFX8-LABEL: s_add_v4i32:
207 ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
208 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
209 ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
210 ; GFX8-NEXT: v_mov_b32_e32 v4, s8
211 ; GFX8-NEXT: v_mov_b32_e32 v5, s9
212 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
213 ; GFX8-NEXT: s_add_i32 s3, s3, s7
214 ; GFX8-NEXT: s_add_i32 s2, s2, s6
215 ; GFX8-NEXT: s_add_i32 s1, s1, s5
216 ; GFX8-NEXT: s_add_i32 s0, s0, s4
217 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
218 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
219 ; GFX8-NEXT: v_mov_b32_e32 v2, s2
220 ; GFX8-NEXT: v_mov_b32_e32 v3, s3
221 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
222 ; GFX8-NEXT: s_endpgm
224 ; GFX9-LABEL: s_add_v4i32:
226 ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
227 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
228 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
229 ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
230 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
231 ; GFX9-NEXT: s_add_i32 s3, s3, s7
232 ; GFX9-NEXT: s_add_i32 s2, s2, s6
233 ; GFX9-NEXT: s_add_i32 s1, s1, s5
234 ; GFX9-NEXT: s_add_i32 s0, s0, s4
235 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
236 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
237 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
238 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
239 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9]
240 ; GFX9-NEXT: s_endpgm
242 ; GFX10-LABEL: s_add_v4i32:
244 ; GFX10-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
245 ; GFX10-NEXT: v_mov_b32_e32 v4, 0
246 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
247 ; GFX10-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
248 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
249 ; GFX10-NEXT: s_add_i32 s3, s3, s7
250 ; GFX10-NEXT: s_add_i32 s2, s2, s6
251 ; GFX10-NEXT: s_add_i32 s0, s0, s4
252 ; GFX10-NEXT: s_add_i32 s1, s1, s5
253 ; GFX10-NEXT: v_mov_b32_e32 v0, s0
254 ; GFX10-NEXT: v_mov_b32_e32 v1, s1
255 ; GFX10-NEXT: v_mov_b32_e32 v2, s2
256 ; GFX10-NEXT: v_mov_b32_e32 v3, s3
257 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9]
258 ; GFX10-NEXT: s_endpgm
260 ; GFX11-LABEL: s_add_v4i32:
262 ; GFX11-NEXT: s_load_b128 s[8:11], s[4:5], 0x24
263 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
264 ; GFX11-NEXT: s_load_b256 s[0:7], s[10:11], 0x0
265 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
266 ; GFX11-NEXT: s_add_i32 s3, s3, s7
267 ; GFX11-NEXT: s_add_i32 s2, s2, s6
268 ; GFX11-NEXT: s_add_i32 s0, s0, s4
269 ; GFX11-NEXT: s_add_i32 s1, s1, s5
270 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
271 ; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s1
272 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
273 ; GFX11-NEXT: v_mov_b32_e32 v2, s2
274 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[8:9]
275 ; GFX11-NEXT: s_endpgm
277 ; GFX12-LABEL: s_add_v4i32:
279 ; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x24
280 ; GFX12-NEXT: s_wait_kmcnt 0x0
281 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0
282 ; GFX12-NEXT: s_wait_kmcnt 0x0
283 ; GFX12-NEXT: s_add_co_i32 s3, s3, s7
284 ; GFX12-NEXT: s_add_co_i32 s2, s2, s6
285 ; GFX12-NEXT: s_add_co_i32 s0, s0, s4
286 ; GFX12-NEXT: s_add_co_i32 s1, s1, s5
287 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
288 ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s1
289 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
290 ; GFX12-NEXT: v_mov_b32_e32 v2, s2
291 ; GFX12-NEXT: global_store_b128 v4, v[0:3], s[8:9]
292 ; GFX12-NEXT: s_endpgm
; IR under test: the getelementptr steps by one whole <4 x i32>, so %b is the
; vector stored directly after %a.
293 %b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1
294 %a = load <4 x i32>, ptr addrspace(1) %in
295 %b = load <4 x i32>, ptr addrspace(1) %b_ptr
296 %result = add <4 x i32> %a, %b
297 store <4 x i32> %result, ptr addrspace(1) %out
; Scalar <8 x i32> add with both operands passed by value as kernel arguments.
; Unlike the pointer-based tests, %a and %b are read straight from the
; kernarg segment: the assertions below expect one 16-dword scalar load that
; brings in both vectors (s[8:15] and s[16:23]), eight scalar adds, and the
; 256-bit result written as two 128-bit stores (upper half at offset 16,
; lower half at offset 0).
301 define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x i32> %b) {
302 ; GFX6-LABEL: s_add_v8i32:
303 ; GFX6: ; %bb.0: ; %entry
304 ; GFX6-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x11
305 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
306 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
307 ; GFX6-NEXT: s_mov_b32 s2, -1
308 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
309 ; GFX6-NEXT: s_add_i32 s4, s11, s19
310 ; GFX6-NEXT: s_add_i32 s5, s10, s18
311 ; GFX6-NEXT: s_add_i32 s6, s9, s17
312 ; GFX6-NEXT: s_add_i32 s7, s8, s16
313 ; GFX6-NEXT: s_add_i32 s8, s15, s23
314 ; GFX6-NEXT: s_add_i32 s9, s14, s22
315 ; GFX6-NEXT: s_add_i32 s10, s13, s21
316 ; GFX6-NEXT: s_add_i32 s11, s12, s20
317 ; GFX6-NEXT: v_mov_b32_e32 v0, s11
318 ; GFX6-NEXT: v_mov_b32_e32 v1, s10
319 ; GFX6-NEXT: v_mov_b32_e32 v2, s9
320 ; GFX6-NEXT: v_mov_b32_e32 v3, s8
321 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
322 ; GFX6-NEXT: s_waitcnt expcnt(0)
323 ; GFX6-NEXT: v_mov_b32_e32 v0, s7
324 ; GFX6-NEXT: v_mov_b32_e32 v1, s6
325 ; GFX6-NEXT: v_mov_b32_e32 v2, s5
326 ; GFX6-NEXT: v_mov_b32_e32 v3, s4
327 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
328 ; GFX6-NEXT: s_endpgm
330 ; GFX8-LABEL: s_add_v8i32:
331 ; GFX8: ; %bb.0: ; %entry
332 ; GFX8-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x44
333 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
334 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
335 ; GFX8-NEXT: s_add_i32 s4, s11, s19
336 ; GFX8-NEXT: s_add_i32 s5, s10, s18
337 ; GFX8-NEXT: s_add_i32 s6, s9, s17
338 ; GFX8-NEXT: s_add_i32 s7, s8, s16
339 ; GFX8-NEXT: s_add_i32 s2, s15, s23
340 ; GFX8-NEXT: s_add_i32 s3, s14, s22
341 ; GFX8-NEXT: s_add_i32 s8, s13, s21
342 ; GFX8-NEXT: s_add_i32 s9, s12, s20
343 ; GFX8-NEXT: v_mov_b32_e32 v3, s2
344 ; GFX8-NEXT: s_add_u32 s2, s0, 16
345 ; GFX8-NEXT: v_mov_b32_e32 v2, s3
346 ; GFX8-NEXT: s_addc_u32 s3, s1, 0
347 ; GFX8-NEXT: v_mov_b32_e32 v5, s3
348 ; GFX8-NEXT: v_mov_b32_e32 v0, s9
349 ; GFX8-NEXT: v_mov_b32_e32 v1, s8
350 ; GFX8-NEXT: v_mov_b32_e32 v4, s2
351 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
352 ; GFX8-NEXT: v_mov_b32_e32 v5, s1
353 ; GFX8-NEXT: v_mov_b32_e32 v0, s7
354 ; GFX8-NEXT: v_mov_b32_e32 v1, s6
355 ; GFX8-NEXT: v_mov_b32_e32 v2, s5
356 ; GFX8-NEXT: v_mov_b32_e32 v3, s4
357 ; GFX8-NEXT: v_mov_b32_e32 v4, s0
358 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
359 ; GFX8-NEXT: s_endpgm
361 ; GFX9-LABEL: s_add_v8i32:
362 ; GFX9: ; %bb.0: ; %entry
363 ; GFX9-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x44
364 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
365 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
366 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
367 ; GFX9-NEXT: s_add_i32 s4, s9, s17
368 ; GFX9-NEXT: s_add_i32 s5, s8, s16
369 ; GFX9-NEXT: s_add_i32 s6, s15, s23
370 ; GFX9-NEXT: s_add_i32 s7, s14, s22
371 ; GFX9-NEXT: s_add_i32 s8, s13, s21
372 ; GFX9-NEXT: s_add_i32 s9, s12, s20
373 ; GFX9-NEXT: s_add_i32 s2, s11, s19
374 ; GFX9-NEXT: s_add_i32 s3, s10, s18
375 ; GFX9-NEXT: v_mov_b32_e32 v0, s9
376 ; GFX9-NEXT: v_mov_b32_e32 v1, s8
377 ; GFX9-NEXT: v_mov_b32_e32 v2, s7
378 ; GFX9-NEXT: v_mov_b32_e32 v3, s6
379 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
381 ; GFX9-NEXT: v_mov_b32_e32 v0, s5
382 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
383 ; GFX9-NEXT: v_mov_b32_e32 v2, s3
384 ; GFX9-NEXT: v_mov_b32_e32 v3, s2
385 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
386 ; GFX9-NEXT: s_endpgm
388 ; GFX10-LABEL: s_add_v8i32:
389 ; GFX10: ; %bb.0: ; %entry
390 ; GFX10-NEXT: s_clause 0x1
391 ; GFX10-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x44
392 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
393 ; GFX10-NEXT: v_mov_b32_e32 v8, 0
394 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
395 ; GFX10-NEXT: s_add_i32 s4, s9, s17
396 ; GFX10-NEXT: s_add_i32 s5, s8, s16
397 ; GFX10-NEXT: s_add_i32 s6, s15, s23
398 ; GFX10-NEXT: s_add_i32 s7, s14, s22
399 ; GFX10-NEXT: s_add_i32 s8, s12, s20
400 ; GFX10-NEXT: s_add_i32 s9, s13, s21
401 ; GFX10-NEXT: s_add_i32 s2, s11, s19
402 ; GFX10-NEXT: s_add_i32 s3, s10, s18
403 ; GFX10-NEXT: v_mov_b32_e32 v0, s8
404 ; GFX10-NEXT: v_mov_b32_e32 v1, s9
405 ; GFX10-NEXT: v_mov_b32_e32 v2, s7
406 ; GFX10-NEXT: v_mov_b32_e32 v3, s6
407 ; GFX10-NEXT: v_mov_b32_e32 v4, s5
408 ; GFX10-NEXT: v_mov_b32_e32 v5, s4
409 ; GFX10-NEXT: v_mov_b32_e32 v6, s3
410 ; GFX10-NEXT: v_mov_b32_e32 v7, s2
411 ; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
412 ; GFX10-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
413 ; GFX10-NEXT: s_endpgm
415 ; GFX11-LABEL: s_add_v8i32:
416 ; GFX11: ; %bb.0: ; %entry
417 ; GFX11-NEXT: s_clause 0x1
418 ; GFX11-NEXT: s_load_b512 s[8:23], s[4:5], 0x44
419 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
420 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
421 ; GFX11-NEXT: s_add_i32 s4, s9, s17
422 ; GFX11-NEXT: s_add_i32 s5, s8, s16
423 ; GFX11-NEXT: s_add_i32 s6, s15, s23
424 ; GFX11-NEXT: s_add_i32 s7, s14, s22
425 ; GFX11-NEXT: s_add_i32 s8, s12, s20
426 ; GFX11-NEXT: s_add_i32 s9, s13, s21
427 ; GFX11-NEXT: s_add_i32 s2, s11, s19
428 ; GFX11-NEXT: s_add_i32 s3, s10, s18
429 ; GFX11-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s9
430 ; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s6
431 ; GFX11-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v5, s4
432 ; GFX11-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v7, s2
433 ; GFX11-NEXT: v_mov_b32_e32 v6, s3
434 ; GFX11-NEXT: s_clause 0x1
435 ; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
436 ; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1]
437 ; GFX11-NEXT: s_endpgm
439 ; GFX12-LABEL: s_add_v8i32:
440 ; GFX12: ; %bb.0: ; %entry
441 ; GFX12-NEXT: s_clause 0x1
442 ; GFX12-NEXT: s_load_b512 s[8:23], s[4:5], 0x44
443 ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
444 ; GFX12-NEXT: s_wait_kmcnt 0x0
445 ; GFX12-NEXT: s_add_co_i32 s4, s9, s17
446 ; GFX12-NEXT: s_add_co_i32 s5, s8, s16
447 ; GFX12-NEXT: s_add_co_i32 s6, s15, s23
448 ; GFX12-NEXT: s_add_co_i32 s7, s14, s22
449 ; GFX12-NEXT: s_add_co_i32 s8, s12, s20
450 ; GFX12-NEXT: s_add_co_i32 s9, s13, s21
451 ; GFX12-NEXT: s_add_co_i32 s2, s11, s19
452 ; GFX12-NEXT: s_add_co_i32 s3, s10, s18
453 ; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s9
454 ; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s6
455 ; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v5, s4
456 ; GFX12-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v7, s2
457 ; GFX12-NEXT: v_mov_b32_e32 v6, s3
458 ; GFX12-NEXT: s_clause 0x1
459 ; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
460 ; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1]
461 ; GFX12-NEXT: s_endpgm
; IR under test: element-wise add of the two by-value vector arguments; the
; sum is stored to %out.
463 %0 = add <8 x i32> %a, %b
464 store <8 x i32> %0, ptr addrspace(1) %out
; Scalar <16 x i32> add with both operands passed by value as kernel arguments.
; The assertions below expect each 512-bit operand to be fetched from the
; kernarg segment with its own 16-dword scalar load (%a into s[8:23], %b into
; s[36:51]), sixteen scalar adds pairing the two register ranges lane by
; lane, and the result written as four 128-bit stores at offsets 48, 32, 16
; and 0 from %out.
468 define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <16 x i32> %b) {
469 ; GFX6-LABEL: s_add_v16i32:
470 ; GFX6: ; %bb.0: ; %entry
471 ; GFX6-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x19
472 ; GFX6-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x29
473 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
474 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
475 ; GFX6-NEXT: s_mov_b32 s2, -1
476 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
477 ; GFX6-NEXT: s_add_i32 s4, s11, s39
478 ; GFX6-NEXT: s_add_i32 s5, s10, s38
479 ; GFX6-NEXT: s_add_i32 s6, s9, s37
480 ; GFX6-NEXT: s_add_i32 s7, s8, s36
481 ; GFX6-NEXT: s_add_i32 s8, s15, s43
482 ; GFX6-NEXT: s_add_i32 s9, s14, s42
483 ; GFX6-NEXT: s_add_i32 s10, s13, s41
484 ; GFX6-NEXT: s_add_i32 s11, s12, s40
485 ; GFX6-NEXT: s_add_i32 s12, s19, s47
486 ; GFX6-NEXT: s_add_i32 s13, s18, s46
487 ; GFX6-NEXT: s_add_i32 s14, s17, s45
488 ; GFX6-NEXT: s_add_i32 s15, s16, s44
489 ; GFX6-NEXT: s_add_i32 s16, s23, s51
490 ; GFX6-NEXT: s_add_i32 s17, s22, s50
491 ; GFX6-NEXT: s_add_i32 s18, s21, s49
492 ; GFX6-NEXT: s_add_i32 s19, s20, s48
493 ; GFX6-NEXT: v_mov_b32_e32 v0, s19
494 ; GFX6-NEXT: v_mov_b32_e32 v1, s18
495 ; GFX6-NEXT: v_mov_b32_e32 v2, s17
496 ; GFX6-NEXT: v_mov_b32_e32 v3, s16
497 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
498 ; GFX6-NEXT: s_waitcnt expcnt(0)
499 ; GFX6-NEXT: v_mov_b32_e32 v0, s15
500 ; GFX6-NEXT: v_mov_b32_e32 v1, s14
501 ; GFX6-NEXT: v_mov_b32_e32 v2, s13
502 ; GFX6-NEXT: v_mov_b32_e32 v3, s12
503 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
504 ; GFX6-NEXT: s_waitcnt expcnt(0)
505 ; GFX6-NEXT: v_mov_b32_e32 v0, s11
506 ; GFX6-NEXT: v_mov_b32_e32 v1, s10
507 ; GFX6-NEXT: v_mov_b32_e32 v2, s9
508 ; GFX6-NEXT: v_mov_b32_e32 v3, s8
509 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
510 ; GFX6-NEXT: s_waitcnt expcnt(0)
511 ; GFX6-NEXT: v_mov_b32_e32 v0, s7
512 ; GFX6-NEXT: v_mov_b32_e32 v1, s6
513 ; GFX6-NEXT: v_mov_b32_e32 v2, s5
514 ; GFX6-NEXT: v_mov_b32_e32 v3, s4
515 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
516 ; GFX6-NEXT: s_endpgm
518 ; GFX8-LABEL: s_add_v16i32:
519 ; GFX8: ; %bb.0: ; %entry
520 ; GFX8-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
521 ; GFX8-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
522 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
523 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
524 ; GFX8-NEXT: s_add_i32 s4, s11, s39
525 ; GFX8-NEXT: s_add_i32 s5, s10, s38
526 ; GFX8-NEXT: s_add_i32 s6, s9, s37
527 ; GFX8-NEXT: s_add_i32 s7, s8, s36
528 ; GFX8-NEXT: s_add_i32 s8, s15, s43
529 ; GFX8-NEXT: s_add_i32 s9, s14, s42
530 ; GFX8-NEXT: s_add_i32 s10, s13, s41
531 ; GFX8-NEXT: s_add_i32 s11, s12, s40
532 ; GFX8-NEXT: s_add_i32 s12, s19, s47
533 ; GFX8-NEXT: s_add_i32 s13, s18, s46
534 ; GFX8-NEXT: s_add_i32 s14, s17, s45
535 ; GFX8-NEXT: s_add_i32 s15, s16, s44
536 ; GFX8-NEXT: s_add_i32 s2, s23, s51
537 ; GFX8-NEXT: s_add_i32 s3, s22, s50
538 ; GFX8-NEXT: s_add_i32 s16, s21, s49
539 ; GFX8-NEXT: s_add_i32 s17, s20, s48
540 ; GFX8-NEXT: v_mov_b32_e32 v3, s2
541 ; GFX8-NEXT: s_add_u32 s2, s0, 48
542 ; GFX8-NEXT: v_mov_b32_e32 v2, s3
543 ; GFX8-NEXT: s_addc_u32 s3, s1, 0
544 ; GFX8-NEXT: v_mov_b32_e32 v5, s3
545 ; GFX8-NEXT: v_mov_b32_e32 v4, s2
546 ; GFX8-NEXT: s_add_u32 s2, s0, 32
547 ; GFX8-NEXT: v_mov_b32_e32 v0, s17
548 ; GFX8-NEXT: v_mov_b32_e32 v1, s16
549 ; GFX8-NEXT: s_addc_u32 s3, s1, 0
550 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
551 ; GFX8-NEXT: v_mov_b32_e32 v5, s3
552 ; GFX8-NEXT: v_mov_b32_e32 v4, s2
553 ; GFX8-NEXT: s_add_u32 s2, s0, 16
554 ; GFX8-NEXT: v_mov_b32_e32 v0, s15
555 ; GFX8-NEXT: v_mov_b32_e32 v1, s14
556 ; GFX8-NEXT: v_mov_b32_e32 v2, s13
557 ; GFX8-NEXT: v_mov_b32_e32 v3, s12
558 ; GFX8-NEXT: s_addc_u32 s3, s1, 0
559 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
560 ; GFX8-NEXT: v_mov_b32_e32 v5, s3
561 ; GFX8-NEXT: v_mov_b32_e32 v0, s11
562 ; GFX8-NEXT: v_mov_b32_e32 v1, s10
563 ; GFX8-NEXT: v_mov_b32_e32 v2, s9
564 ; GFX8-NEXT: v_mov_b32_e32 v3, s8
565 ; GFX8-NEXT: v_mov_b32_e32 v4, s2
566 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
567 ; GFX8-NEXT: v_mov_b32_e32 v5, s1
568 ; GFX8-NEXT: v_mov_b32_e32 v0, s7
569 ; GFX8-NEXT: v_mov_b32_e32 v1, s6
570 ; GFX8-NEXT: v_mov_b32_e32 v2, s5
571 ; GFX8-NEXT: v_mov_b32_e32 v3, s4
572 ; GFX8-NEXT: v_mov_b32_e32 v4, s0
573 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
574 ; GFX8-NEXT: s_endpgm
576 ; GFX9-LABEL: s_add_v16i32:
577 ; GFX9: ; %bb.0: ; %entry
578 ; GFX9-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
579 ; GFX9-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
580 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
581 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
582 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
583 ; GFX9-NEXT: s_add_i32 s4, s9, s37
584 ; GFX9-NEXT: s_add_i32 s5, s8, s36
585 ; GFX9-NEXT: s_add_i32 s6, s15, s43
586 ; GFX9-NEXT: s_add_i32 s7, s14, s42
587 ; GFX9-NEXT: s_add_i32 s8, s13, s41
588 ; GFX9-NEXT: s_add_i32 s9, s12, s40
589 ; GFX9-NEXT: s_add_i32 s12, s17, s45
590 ; GFX9-NEXT: s_add_i32 s13, s16, s44
591 ; GFX9-NEXT: s_add_i32 s14, s23, s51
592 ; GFX9-NEXT: s_add_i32 s15, s22, s50
593 ; GFX9-NEXT: s_add_i32 s16, s21, s49
594 ; GFX9-NEXT: s_add_i32 s17, s20, s48
595 ; GFX9-NEXT: s_add_i32 s2, s11, s39
596 ; GFX9-NEXT: s_add_i32 s3, s10, s38
597 ; GFX9-NEXT: s_add_i32 s10, s19, s47
598 ; GFX9-NEXT: s_add_i32 s11, s18, s46
599 ; GFX9-NEXT: v_mov_b32_e32 v0, s17
600 ; GFX9-NEXT: v_mov_b32_e32 v1, s16
601 ; GFX9-NEXT: v_mov_b32_e32 v2, s15
602 ; GFX9-NEXT: v_mov_b32_e32 v3, s14
603 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
605 ; GFX9-NEXT: v_mov_b32_e32 v0, s13
606 ; GFX9-NEXT: v_mov_b32_e32 v1, s12
607 ; GFX9-NEXT: v_mov_b32_e32 v2, s11
608 ; GFX9-NEXT: v_mov_b32_e32 v3, s10
609 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
611 ; GFX9-NEXT: v_mov_b32_e32 v0, s9
612 ; GFX9-NEXT: v_mov_b32_e32 v1, s8
613 ; GFX9-NEXT: v_mov_b32_e32 v2, s7
614 ; GFX9-NEXT: v_mov_b32_e32 v3, s6
615 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
617 ; GFX9-NEXT: v_mov_b32_e32 v0, s5
618 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
619 ; GFX9-NEXT: v_mov_b32_e32 v2, s3
620 ; GFX9-NEXT: v_mov_b32_e32 v3, s2
621 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
622 ; GFX9-NEXT: s_endpgm
624 ; GFX10-LABEL: s_add_v16i32:
625 ; GFX10: ; %bb.0: ; %entry
626 ; GFX10-NEXT: s_clause 0x2
627 ; GFX10-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
628 ; GFX10-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
629 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
630 ; GFX10-NEXT: v_mov_b32_e32 v16, 0
631 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
632 ; GFX10-NEXT: s_add_i32 s4, s9, s37
633 ; GFX10-NEXT: s_add_i32 s5, s8, s36
634 ; GFX10-NEXT: s_add_i32 s6, s15, s43
635 ; GFX10-NEXT: s_add_i32 s7, s14, s42
636 ; GFX10-NEXT: s_add_i32 s8, s13, s41
637 ; GFX10-NEXT: s_add_i32 s9, s12, s40
638 ; GFX10-NEXT: s_add_i32 s12, s17, s45
639 ; GFX10-NEXT: s_add_i32 s13, s16, s44
640 ; GFX10-NEXT: s_add_i32 s14, s23, s51
641 ; GFX10-NEXT: s_add_i32 s15, s22, s50
642 ; GFX10-NEXT: s_add_i32 s16, s20, s48
643 ; GFX10-NEXT: s_add_i32 s17, s21, s49
644 ; GFX10-NEXT: s_add_i32 s2, s11, s39
645 ; GFX10-NEXT: s_add_i32 s3, s10, s38
646 ; GFX10-NEXT: s_add_i32 s10, s19, s47
647 ; GFX10-NEXT: s_add_i32 s11, s18, s46
648 ; GFX10-NEXT: v_mov_b32_e32 v0, s16
649 ; GFX10-NEXT: v_mov_b32_e32 v1, s17
650 ; GFX10-NEXT: v_mov_b32_e32 v2, s15
651 ; GFX10-NEXT: v_mov_b32_e32 v3, s14
652 ; GFX10-NEXT: v_mov_b32_e32 v4, s13
653 ; GFX10-NEXT: v_mov_b32_e32 v5, s12
654 ; GFX10-NEXT: v_mov_b32_e32 v6, s11
655 ; GFX10-NEXT: v_mov_b32_e32 v7, s10
656 ; GFX10-NEXT: v_mov_b32_e32 v8, s9
657 ; GFX10-NEXT: v_mov_b32_e32 v9, s8
658 ; GFX10-NEXT: v_mov_b32_e32 v10, s7
659 ; GFX10-NEXT: v_mov_b32_e32 v11, s6
660 ; GFX10-NEXT: v_mov_b32_e32 v12, s5
661 ; GFX10-NEXT: v_mov_b32_e32 v13, s4
662 ; GFX10-NEXT: v_mov_b32_e32 v14, s3
663 ; GFX10-NEXT: v_mov_b32_e32 v15, s2
664 ; GFX10-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:48
665 ; GFX10-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:32
666 ; GFX10-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:16
667 ; GFX10-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1]
668 ; GFX10-NEXT: s_endpgm
670 ; GFX11-LABEL: s_add_v16i32:
671 ; GFX11: ; %bb.0: ; %entry
672 ; GFX11-NEXT: s_clause 0x2
673 ; GFX11-NEXT: s_load_b512 s[8:23], s[4:5], 0x64
674 ; GFX11-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
675 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
676 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
677 ; GFX11-NEXT: s_add_i32 s4, s9, s37
678 ; GFX11-NEXT: s_add_i32 s5, s8, s36
679 ; GFX11-NEXT: s_add_i32 s6, s15, s43
680 ; GFX11-NEXT: s_add_i32 s7, s14, s42
681 ; GFX11-NEXT: s_add_i32 s8, s13, s41
682 ; GFX11-NEXT: s_add_i32 s9, s12, s40
683 ; GFX11-NEXT: s_add_i32 s12, s17, s45
684 ; GFX11-NEXT: s_add_i32 s13, s16, s44
685 ; GFX11-NEXT: s_add_i32 s14, s23, s51
686 ; GFX11-NEXT: s_add_i32 s15, s22, s50
687 ; GFX11-NEXT: s_add_i32 s16, s20, s48
688 ; GFX11-NEXT: s_add_i32 s17, s21, s49
689 ; GFX11-NEXT: s_add_i32 s2, s11, s39
690 ; GFX11-NEXT: s_add_i32 s3, s10, s38
691 ; GFX11-NEXT: s_add_i32 s10, s19, s47
692 ; GFX11-NEXT: s_add_i32 s11, s18, s46
693 ; GFX11-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s17
694 ; GFX11-NEXT: v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v3, s14
695 ; GFX11-NEXT: v_dual_mov_b32 v2, s15 :: v_dual_mov_b32 v5, s12
696 ; GFX11-NEXT: v_dual_mov_b32 v4, s13 :: v_dual_mov_b32 v7, s10
697 ; GFX11-NEXT: v_dual_mov_b32 v6, s11 :: v_dual_mov_b32 v9, s8
698 ; GFX11-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v11, s6
699 ; GFX11-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v13, s4
700 ; GFX11-NEXT: v_dual_mov_b32 v12, s5 :: v_dual_mov_b32 v15, s2
701 ; GFX11-NEXT: v_mov_b32_e32 v14, s3
702 ; GFX11-NEXT: s_clause 0x3
703 ; GFX11-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48
704 ; GFX11-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:32
705 ; GFX11-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16
706 ; GFX11-NEXT: global_store_b128 v16, v[12:15], s[0:1]
707 ; GFX11-NEXT: s_endpgm
709 ; GFX12-LABEL: s_add_v16i32:
710 ; GFX12: ; %bb.0: ; %entry
711 ; GFX12-NEXT: s_clause 0x2
712 ; GFX12-NEXT: s_load_b512 s[8:23], s[4:5], 0x64
713 ; GFX12-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
714 ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
715 ; GFX12-NEXT: s_wait_kmcnt 0x0
716 ; GFX12-NEXT: s_add_co_i32 s4, s9, s37
717 ; GFX12-NEXT: s_add_co_i32 s5, s8, s36
718 ; GFX12-NEXT: s_add_co_i32 s6, s15, s43
719 ; GFX12-NEXT: s_add_co_i32 s7, s14, s42
720 ; GFX12-NEXT: s_add_co_i32 s8, s13, s41
721 ; GFX12-NEXT: s_add_co_i32 s9, s12, s40
722 ; GFX12-NEXT: s_add_co_i32 s12, s17, s45
723 ; GFX12-NEXT: s_add_co_i32 s13, s16, s44
724 ; GFX12-NEXT: s_add_co_i32 s14, s23, s51
725 ; GFX12-NEXT: s_add_co_i32 s15, s22, s50
726 ; GFX12-NEXT: s_add_co_i32 s16, s20, s48
727 ; GFX12-NEXT: s_add_co_i32 s17, s21, s49
728 ; GFX12-NEXT: s_add_co_i32 s2, s11, s39
729 ; GFX12-NEXT: s_add_co_i32 s3, s10, s38
730 ; GFX12-NEXT: s_add_co_i32 s10, s19, s47
731 ; GFX12-NEXT: s_add_co_i32 s11, s18, s46
732 ; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s17
733 ; GFX12-NEXT: v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v3, s14
734 ; GFX12-NEXT: v_dual_mov_b32 v2, s15 :: v_dual_mov_b32 v5, s12
735 ; GFX12-NEXT: v_dual_mov_b32 v4, s13 :: v_dual_mov_b32 v7, s10
736 ; GFX12-NEXT: v_dual_mov_b32 v6, s11 :: v_dual_mov_b32 v9, s8
737 ; GFX12-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v11, s6
738 ; GFX12-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v13, s4
739 ; GFX12-NEXT: v_dual_mov_b32 v12, s5 :: v_dual_mov_b32 v15, s2
740 ; GFX12-NEXT: v_mov_b32_e32 v14, s3
741 ; GFX12-NEXT: s_clause 0x3
742 ; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48
743 ; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:32
744 ; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16
745 ; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1]
746 ; GFX12-NEXT: s_endpgm
; IR under test: element-wise add of the two by-value vector arguments; the
; sum is stored to %out.
748 %0 = add <16 x i32> %a, %b
749 store <16 x i32> %0, ptr addrspace(1) %out
753 define amdgpu_kernel void @v_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
754 ; GFX6-LABEL: v_add_i32:
756 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
757 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
758 ; GFX6-NEXT: s_mov_b32 s10, 0
759 ; GFX6-NEXT: s_mov_b32 s11, s7
760 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
761 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
762 ; GFX6-NEXT: s_mov_b64 s[8:9], s[2:3]
763 ; GFX6-NEXT: v_mov_b32_e32 v1, 0
764 ; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
765 ; GFX6-NEXT: s_waitcnt vmcnt(0)
766 ; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 offset:4 glc
767 ; GFX6-NEXT: s_waitcnt vmcnt(0)
768 ; GFX6-NEXT: s_mov_b32 s6, -1
769 ; GFX6-NEXT: s_mov_b32 s4, s0
770 ; GFX6-NEXT: s_mov_b32 s5, s1
771 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0
772 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
773 ; GFX6-NEXT: s_endpgm
775 ; GFX8-LABEL: v_add_i32:
777 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
778 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
779 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
780 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
781 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
782 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
783 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0
784 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
785 ; GFX8-NEXT: flat_load_dword v4, v[0:1] glc
786 ; GFX8-NEXT: s_waitcnt vmcnt(0)
787 ; GFX8-NEXT: flat_load_dword v2, v[2:3] glc
788 ; GFX8-NEXT: s_waitcnt vmcnt(0)
789 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
790 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
791 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v4, v2
792 ; GFX8-NEXT: flat_store_dword v[0:1], v2
793 ; GFX8-NEXT: s_endpgm
795 ; GFX9-LABEL: v_add_i32:
797 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
798 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
799 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
800 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
801 ; GFX9-NEXT: s_waitcnt vmcnt(0)
802 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc
803 ; GFX9-NEXT: s_waitcnt vmcnt(0)
804 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
805 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v2
806 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
807 ; GFX9-NEXT: s_endpgm
809 ; GFX10-LABEL: v_add_i32:
811 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
812 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
813 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
814 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
815 ; GFX10-NEXT: s_waitcnt vmcnt(0)
816 ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc
817 ; GFX10-NEXT: s_waitcnt vmcnt(0)
818 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
819 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v2
820 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
821 ; GFX10-NEXT: s_endpgm
823 ; GFX11-LABEL: v_add_i32:
825 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
826 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
827 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
828 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
829 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
830 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
831 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
832 ; GFX11-NEXT: s_waitcnt vmcnt(0)
833 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:4 glc dlc
834 ; GFX11-NEXT: s_waitcnt vmcnt(0)
835 ; GFX11-NEXT: v_add_nc_u32_e32 v0, v1, v0
836 ; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
837 ; GFX11-NEXT: s_endpgm
839 ; GFX12-LABEL: v_add_i32:
841 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
842 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
843 ; GFX12-NEXT: v_mov_b32_e32 v2, 0
844 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
845 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
846 ; GFX12-NEXT: s_wait_kmcnt 0x0
847 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
848 ; GFX12-NEXT: s_wait_loadcnt 0x0
849 ; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:4 scope:SCOPE_SYS
850 ; GFX12-NEXT: s_wait_loadcnt 0x0
851 ; GFX12-NEXT: v_add_nc_u32_e32 v0, v1, v0
852 ; GFX12-NEXT: global_store_b32 v2, v0, s[0:1]
853 ; GFX12-NEXT: s_endpgm
854 %tid = call i32 @llvm.amdgcn.workitem.id.x()
855 %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid
856 %b_ptr = getelementptr i32, ptr addrspace(1) %gep, i32 1
857 %a = load volatile i32, ptr addrspace(1) %gep
858 %b = load volatile i32, ptr addrspace(1) %b_ptr
859 %result = add i32 %a, %b
860 store i32 %result, ptr addrspace(1) %out
; Loads one i32 (volatile) from %in at the index given by the workitem id,
; adds the constant 123 and stores the sum to %out. The checks expect the
; constant to appear as the literal 0x7b in a single VALU add on every target.
864 define amdgpu_kernel void @v_add_imm_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
865 ; GFX6-LABEL: v_add_imm_i32:
867 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
868 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
869 ; GFX6-NEXT: s_mov_b32 s10, 0
870 ; GFX6-NEXT: s_mov_b32 s11, s7
871 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
872 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
873 ; GFX6-NEXT: s_mov_b64 s[8:9], s[2:3]
874 ; GFX6-NEXT: v_mov_b32_e32 v1, 0
875 ; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 glc
876 ; GFX6-NEXT: s_waitcnt vmcnt(0)
877 ; GFX6-NEXT: s_mov_b32 s6, -1
878 ; GFX6-NEXT: s_mov_b32 s4, s0
879 ; GFX6-NEXT: s_mov_b32 s5, s1
880 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0x7b, v0
881 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
882 ; GFX6-NEXT: s_endpgm
884 ; GFX8-LABEL: v_add_imm_i32:
886 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
887 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
888 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
889 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
890 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
891 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
892 ; GFX8-NEXT: flat_load_dword v2, v[0:1] glc
893 ; GFX8-NEXT: s_waitcnt vmcnt(0)
894 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
895 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
896 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7b, v2
897 ; GFX8-NEXT: flat_store_dword v[0:1], v2
898 ; GFX8-NEXT: s_endpgm
900 ; GFX9-LABEL: v_add_imm_i32:
902 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
903 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
904 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
905 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
906 ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
907 ; GFX9-NEXT: s_waitcnt vmcnt(0)
908 ; GFX9-NEXT: v_add_u32_e32 v0, 0x7b, v0
909 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
910 ; GFX9-NEXT: s_endpgm
912 ; GFX10-LABEL: v_add_imm_i32:
914 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
915 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
916 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
917 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
918 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
919 ; GFX10-NEXT: s_waitcnt vmcnt(0)
920 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x7b, v0
921 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
922 ; GFX10-NEXT: s_endpgm
924 ; GFX11-LABEL: v_add_imm_i32:
926 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
927 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
928 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
929 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
930 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
931 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
932 ; GFX11-NEXT: s_waitcnt vmcnt(0)
933 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7b, v0
934 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
935 ; GFX11-NEXT: s_endpgm
937 ; GFX12-LABEL: v_add_imm_i32:
939 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
940 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
941 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
942 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
943 ; GFX12-NEXT: s_wait_kmcnt 0x0
944 ; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] scope:SCOPE_SYS
945 ; GFX12-NEXT: s_wait_loadcnt 0x0
946 ; GFX12-NEXT: v_add_nc_u32_e32 v0, 0x7b, v0
947 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
948 ; GFX12-NEXT: s_endpgm
949 %tid = call i32 @llvm.amdgcn.workitem.id.x()
950 %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid
951 %b_ptr = getelementptr i32, ptr addrspace(1) %gep, i32 1 ; NOTE(review): %b_ptr is not referenced by the load/add/store below
952 %a = load volatile i32, ptr addrspace(1) %gep
953 %result = add i32 %a, 123
954 store i32 %result, ptr addrspace(1) %out
; Adds the two i64 kernel arguments %a and %b and stores the sum to %out.
; The checks expect a scalar s_add_u32/s_addc_u32 pair on GFX6-GFX11 and a
; single s_add_nc_u64 on GFX12.
958 define amdgpu_kernel void @add64(ptr addrspace(1) %out, i64 %a, i64 %b) {
960 ; GFX6: ; %bb.0: ; %entry
961 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
962 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
963 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
964 ; GFX6-NEXT: s_mov_b32 s6, -1
965 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
966 ; GFX6-NEXT: s_mov_b32 s4, s0
967 ; GFX6-NEXT: s_add_u32 s0, s2, s8
968 ; GFX6-NEXT: s_mov_b32 s5, s1
969 ; GFX6-NEXT: s_addc_u32 s1, s3, s9
970 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
971 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
972 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
973 ; GFX6-NEXT: s_endpgm
976 ; GFX8: ; %bb.0: ; %entry
977 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
978 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
979 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
980 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
981 ; GFX8-NEXT: s_add_u32 s0, s2, s4
982 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
983 ; GFX8-NEXT: s_addc_u32 s1, s3, s5
984 ; GFX8-NEXT: v_mov_b32_e32 v3, s1
985 ; GFX8-NEXT: v_mov_b32_e32 v2, s0
986 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
987 ; GFX8-NEXT: s_endpgm
990 ; GFX9: ; %bb.0: ; %entry
991 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
992 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
993 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
994 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
995 ; GFX9-NEXT: s_add_u32 s2, s2, s6
996 ; GFX9-NEXT: s_addc_u32 s3, s3, s7
997 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
998 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
999 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
1000 ; GFX9-NEXT: s_endpgm
1002 ; GFX10-LABEL: add64:
1003 ; GFX10: ; %bb.0: ; %entry
1004 ; GFX10-NEXT: s_clause 0x1
1005 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1006 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
1007 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
1008 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1009 ; GFX10-NEXT: s_add_u32 s2, s2, s6
1010 ; GFX10-NEXT: s_addc_u32 s3, s3, s7
1011 ; GFX10-NEXT: v_mov_b32_e32 v0, s2
1012 ; GFX10-NEXT: v_mov_b32_e32 v1, s3
1013 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
1014 ; GFX10-NEXT: s_endpgm
1016 ; GFX11-LABEL: add64:
1017 ; GFX11: ; %bb.0: ; %entry
1018 ; GFX11-NEXT: s_clause 0x1
1019 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
1020 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
1021 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1022 ; GFX11-NEXT: s_add_u32 s2, s2, s4
1023 ; GFX11-NEXT: s_addc_u32 s3, s3, s5
1024 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
1025 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
1026 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
1027 ; GFX11-NEXT: s_endpgm
1029 ; GFX12-LABEL: add64:
1030 ; GFX12: ; %bb.0: ; %entry
1031 ; GFX12-NEXT: s_clause 0x1
1032 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
1033 ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
1034 ; GFX12-NEXT: s_wait_kmcnt 0x0
1035 ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5]
1036 ; GFX12-NEXT: v_mov_b32_e32 v2, 0
1037 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
1038 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
1039 ; GFX12-NEXT: s_endpgm
1041 %add = add i64 %a, %b
1042 store i64 %add, ptr addrspace(1) %out
1046 ; The v_addc_u32 and v_add_i32 instructions can't read SGPRs, because they
1047 ; use VCC. The test is designed so that %a will be stored in an SGPR and
1048 ; %0 will be stored in a VGPR, so the compiler will be forced to copy %a
1049 ; to a VGPR before doing the add.
; Loads an i64 from %in and stores an i64 result to %out. The checks expect
; the loaded value to arrive through a scalar load (s_load_dwordx2/s_load_b64)
; and to be added with scalar instructions (s_add_u32/s_addc_u32, or
; s_add_nc_u64 on GFX12) before the sum is moved to VGPRs for the store.
; NOTE(review): no check line below shows a VALU add or a copy of %a into a
; VGPR ahead of the add — confirm the description preceding this function
; still matches the generated code.
1050 define amdgpu_kernel void @add64_sgpr_vgpr(ptr addrspace(1) %out, i64 %a, ptr addrspace(1) %in) {
1051 ; GFX6-LABEL: add64_sgpr_vgpr:
1052 ; GFX6: ; %bb.0: ; %entry
1053 ; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
1054 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1055 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
1056 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0
1057 ; GFX6-NEXT: s_mov_b32 s4, s0
1058 ; GFX6-NEXT: s_mov_b32 s5, s1
1059 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
1060 ; GFX6-NEXT: s_mov_b32 s6, -1
1061 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
1062 ; GFX6-NEXT: s_add_u32 s0, s2, s8
1063 ; GFX6-NEXT: s_addc_u32 s1, s3, s9
1064 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
1065 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
1066 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1067 ; GFX6-NEXT: s_endpgm
1069 ; GFX8-LABEL: add64_sgpr_vgpr:
1070 ; GFX8: ; %bb.0: ; %entry
1071 ; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
1072 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1073 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1074 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
1075 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
1076 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1077 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1078 ; GFX8-NEXT: s_add_u32 s0, s2, s4
1079 ; GFX8-NEXT: s_addc_u32 s1, s3, s5
1080 ; GFX8-NEXT: v_mov_b32_e32 v3, s1
1081 ; GFX8-NEXT: v_mov_b32_e32 v2, s0
1082 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
1083 ; GFX8-NEXT: s_endpgm
1085 ; GFX9-LABEL: add64_sgpr_vgpr:
1086 ; GFX9: ; %bb.0: ; %entry
1087 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
1088 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1089 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1090 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1091 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
1092 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1093 ; GFX9-NEXT: s_add_u32 s2, s2, s4
1094 ; GFX9-NEXT: s_addc_u32 s3, s3, s5
1095 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
1096 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1097 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
1098 ; GFX9-NEXT: s_endpgm
1100 ; GFX10-LABEL: add64_sgpr_vgpr:
1101 ; GFX10: ; %bb.0: ; %entry
1102 ; GFX10-NEXT: s_clause 0x1
1103 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
1104 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1105 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
1106 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1107 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
1108 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1109 ; GFX10-NEXT: s_add_u32 s2, s2, s4
1110 ; GFX10-NEXT: s_addc_u32 s3, s3, s5
1111 ; GFX10-NEXT: v_mov_b32_e32 v0, s2
1112 ; GFX10-NEXT: v_mov_b32_e32 v1, s3
1113 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
1114 ; GFX10-NEXT: s_endpgm
1116 ; GFX11-LABEL: add64_sgpr_vgpr:
1117 ; GFX11: ; %bb.0: ; %entry
1118 ; GFX11-NEXT: s_clause 0x1
1119 ; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
1120 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
1121 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1122 ; GFX11-NEXT: s_load_b64 s[4:5], s[6:7], 0x0
1123 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1124 ; GFX11-NEXT: s_add_u32 s2, s2, s4
1125 ; GFX11-NEXT: s_addc_u32 s3, s3, s5
1126 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
1127 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
1128 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
1129 ; GFX11-NEXT: s_endpgm
1131 ; GFX12-LABEL: add64_sgpr_vgpr:
1132 ; GFX12: ; %bb.0: ; %entry
1133 ; GFX12-NEXT: s_clause 0x1
1134 ; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
1135 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
1136 ; GFX12-NEXT: s_wait_kmcnt 0x0
1137 ; GFX12-NEXT: s_load_b64 s[4:5], s[6:7], 0x0
1138 ; GFX12-NEXT: s_wait_kmcnt 0x0
1139 ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5]
1140 ; GFX12-NEXT: v_mov_b32_e32 v2, 0
1141 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
1142 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
1143 ; GFX12-NEXT: s_endpgm
1145 %0 = load i64, ptr addrspace(1) %in
1147 store i64 %1, ptr addrspace(1) %out
1151 ; Test i64 add inside a branch.
; Branches on %a == 0: the %if path loads an i64 from %in, and a phi merges
; that value (%1) with the %else path's value (%2) before the result is
; stored to %out. The checks place the 64-bit scalar add (s_add_u32 +
; s_addc_u32, or s_add_nc_u64 on GFX12) in the %else block and the scalar
; load in the %if block.
1152 define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %a, i64 %b, i64 %c) {
1153 ; GFX6-LABEL: add64_in_branch:
1154 ; GFX6: ; %bb.0: ; %entry
1155 ; GFX6-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
1156 ; GFX6-NEXT: s_mov_b64 s[8:9], 0
1157 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
1158 ; GFX6-NEXT: v_cmp_ne_u64_e64 s[10:11], s[4:5], 0
1159 ; GFX6-NEXT: s_and_b64 vcc, exec, s[10:11]
1160 ; GFX6-NEXT: s_cbranch_vccz .LBB9_4
1161 ; GFX6-NEXT: ; %bb.1: ; %else
1162 ; GFX6-NEXT: s_add_u32 s4, s4, s6
1163 ; GFX6-NEXT: s_addc_u32 s5, s5, s7
1164 ; GFX6-NEXT: s_andn2_b64 vcc, exec, s[8:9]
1165 ; GFX6-NEXT: s_cbranch_vccnz .LBB9_3
1166 ; GFX6-NEXT: .LBB9_2: ; %if
1167 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
1168 ; GFX6-NEXT: .LBB9_3: ; %endif
1169 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
1170 ; GFX6-NEXT: v_mov_b32_e32 v0, s4
1171 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
1172 ; GFX6-NEXT: s_mov_b32 s2, -1
1173 ; GFX6-NEXT: v_mov_b32_e32 v1, s5
1174 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1175 ; GFX6-NEXT: s_endpgm
1176 ; GFX6-NEXT: .LBB9_4:
1177 ; GFX6-NEXT: ; implicit-def: $sgpr4_sgpr5
1178 ; GFX6-NEXT: s_branch .LBB9_2
1180 ; GFX8-LABEL: add64_in_branch:
1181 ; GFX8: ; %bb.0: ; %entry
1182 ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
1183 ; GFX8-NEXT: s_mov_b64 s[8:9], 0
1184 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1185 ; GFX8-NEXT: s_cmp_lg_u64 s[4:5], 0
1186 ; GFX8-NEXT: s_cbranch_scc0 .LBB9_4
1187 ; GFX8-NEXT: ; %bb.1: ; %else
1188 ; GFX8-NEXT: s_add_u32 s4, s4, s6
1189 ; GFX8-NEXT: s_addc_u32 s5, s5, s7
1190 ; GFX8-NEXT: s_andn2_b64 vcc, exec, s[8:9]
1191 ; GFX8-NEXT: s_cbranch_vccnz .LBB9_3
1192 ; GFX8-NEXT: .LBB9_2: ; %if
1193 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
1194 ; GFX8-NEXT: .LBB9_3: ; %endif
1195 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1196 ; GFX8-NEXT: v_mov_b32_e32 v2, s4
1197 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
1198 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1199 ; GFX8-NEXT: v_mov_b32_e32 v3, s5
1200 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
1201 ; GFX8-NEXT: s_endpgm
1202 ; GFX8-NEXT: .LBB9_4:
1203 ; GFX8-NEXT: ; implicit-def: $sgpr4_sgpr5
1204 ; GFX8-NEXT: s_branch .LBB9_2
1206 ; GFX9-LABEL: add64_in_branch:
1207 ; GFX9: ; %bb.0: ; %entry
1208 ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
1209 ; GFX9-NEXT: s_mov_b64 s[2:3], 0
1210 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1211 ; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0
1212 ; GFX9-NEXT: s_cbranch_scc0 .LBB9_4
1213 ; GFX9-NEXT: ; %bb.1: ; %else
1214 ; GFX9-NEXT: s_add_u32 s0, s12, s14
1215 ; GFX9-NEXT: s_addc_u32 s1, s13, s15
1216 ; GFX9-NEXT: s_andn2_b64 vcc, exec, s[2:3]
1217 ; GFX9-NEXT: s_cbranch_vccnz .LBB9_3
1218 ; GFX9-NEXT: .LBB9_2: ; %if
1219 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[10:11], 0x0
1220 ; GFX9-NEXT: .LBB9_3: ; %endif
1221 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1222 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
1223 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1224 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1225 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
1226 ; GFX9-NEXT: s_endpgm
1227 ; GFX9-NEXT: .LBB9_4:
1228 ; GFX9-NEXT: ; implicit-def: $sgpr0_sgpr1
1229 ; GFX9-NEXT: s_branch .LBB9_2
1231 ; GFX10-LABEL: add64_in_branch:
1232 ; GFX10: ; %bb.0: ; %entry
1233 ; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
1234 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1235 ; GFX10-NEXT: s_cmp_lg_u64 s[12:13], 0
1236 ; GFX10-NEXT: s_cbranch_scc0 .LBB9_4
1237 ; GFX10-NEXT: ; %bb.1: ; %else
1238 ; GFX10-NEXT: s_add_u32 s0, s12, s14
1239 ; GFX10-NEXT: s_addc_u32 s1, s13, s15
1240 ; GFX10-NEXT: s_cbranch_execnz .LBB9_3
1241 ; GFX10-NEXT: .LBB9_2: ; %if
1242 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[10:11], 0x0
1243 ; GFX10-NEXT: .LBB9_3: ; %endif
1244 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1245 ; GFX10-NEXT: v_mov_b32_e32 v0, s0
1246 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
1247 ; GFX10-NEXT: v_mov_b32_e32 v1, s1
1248 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
1249 ; GFX10-NEXT: s_endpgm
1250 ; GFX10-NEXT: .LBB9_4:
1251 ; GFX10-NEXT: ; implicit-def: $sgpr0_sgpr1
1252 ; GFX10-NEXT: s_branch .LBB9_2
1254 ; GFX11-LABEL: add64_in_branch:
1255 ; GFX11: ; %bb.0: ; %entry
1256 ; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
1257 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1258 ; GFX11-NEXT: s_cmp_lg_u64 s[4:5], 0
1259 ; GFX11-NEXT: s_cbranch_scc0 .LBB9_4
1260 ; GFX11-NEXT: ; %bb.1: ; %else
1261 ; GFX11-NEXT: s_add_u32 s4, s4, s6
1262 ; GFX11-NEXT: s_addc_u32 s5, s5, s7
1263 ; GFX11-NEXT: s_cbranch_execnz .LBB9_3
1264 ; GFX11-NEXT: .LBB9_2: ; %if
1265 ; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x0
1266 ; GFX11-NEXT: .LBB9_3: ; %endif
1267 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1268 ; GFX11-NEXT: v_mov_b32_e32 v0, s4
1269 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
1270 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
1271 ; GFX11-NEXT: s_endpgm
1272 ; GFX11-NEXT: .LBB9_4:
1273 ; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5
1274 ; GFX11-NEXT: s_branch .LBB9_2
1276 ; GFX12-LABEL: add64_in_branch:
1277 ; GFX12: ; %bb.0: ; %entry
1278 ; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
1279 ; GFX12-NEXT: s_wait_kmcnt 0x0
1280 ; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0
1281 ; GFX12-NEXT: s_cbranch_scc0 .LBB9_4
1282 ; GFX12-NEXT: ; %bb.1: ; %else
1283 ; GFX12-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[6:7]
1284 ; GFX12-NEXT: s_cbranch_execnz .LBB9_3
1285 ; GFX12-NEXT: .LBB9_2: ; %if
1286 ; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x0
1287 ; GFX12-NEXT: .LBB9_3: ; %endif
1288 ; GFX12-NEXT: s_wait_kmcnt 0x0
1289 ; GFX12-NEXT: v_mov_b32_e32 v0, s4
1290 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
1291 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
1292 ; GFX12-NEXT: s_endpgm
1293 ; GFX12-NEXT: .LBB9_4:
1294 ; GFX12-NEXT: ; implicit-def: $sgpr4_sgpr5
1295 ; GFX12-NEXT: s_branch .LBB9_2
1297 %0 = icmp eq i64 %a, 0
1298 br i1 %0, label %if, label %else
1301 %1 = load i64, ptr addrspace(1) %in
1309 %3 = phi i64 [%1, %if], [%2, %else]
1310 store i64 %3, ptr addrspace(1) %out
1314 ; Make sure the VOP3 form of add is initially selected. Otherwise a pair
1315 ; of copies from/to VCC would be necessary.
; Defines VCC through inline asm, adds %v and %s, stores the sum through an
; undef addrspace(3) pointer, then consumes the VCC value in a second inline
; asm. The GFX6/GFX8 checks expect the _e64 encoding of the add with s[0:1]
; (not vcc) as its extra destination operand; the GFX9+ checks expect an add
; with no such operand.
1316 define amdgpu_ps void @add_select_vop3(i32 inreg %s, i32 %v) {
1317 ; GFX6-LABEL: add_select_vop3:
1319 ; GFX6-NEXT: v_add_i32_e64 v0, s[0:1], s0, v0
1320 ; GFX6-NEXT: s_mov_b32 m0, -1
1321 ; GFX6-NEXT: ;;#ASMSTART
1322 ; GFX6-NEXT: ; def vcc
1323 ; GFX6-NEXT: ;;#ASMEND
1324 ; GFX6-NEXT: ds_write_b32 v0, v0
1325 ; GFX6-NEXT: ;;#ASMSTART
1326 ; GFX6-NEXT: ; use vcc
1327 ; GFX6-NEXT: ;;#ASMEND
1328 ; GFX6-NEXT: s_endpgm
1330 ; GFX8-LABEL: add_select_vop3:
1332 ; GFX8-NEXT: v_add_u32_e64 v0, s[0:1], s0, v0
1333 ; GFX8-NEXT: s_mov_b32 m0, -1
1334 ; GFX8-NEXT: ;;#ASMSTART
1335 ; GFX8-NEXT: ; def vcc
1336 ; GFX8-NEXT: ;;#ASMEND
1337 ; GFX8-NEXT: ds_write_b32 v0, v0
1338 ; GFX8-NEXT: ;;#ASMSTART
1339 ; GFX8-NEXT: ; use vcc
1340 ; GFX8-NEXT: ;;#ASMEND
1341 ; GFX8-NEXT: s_endpgm
1343 ; GFX9-LABEL: add_select_vop3:
1345 ; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
1346 ; GFX9-NEXT: ;;#ASMSTART
1347 ; GFX9-NEXT: ; def vcc
1348 ; GFX9-NEXT: ;;#ASMEND
1349 ; GFX9-NEXT: ds_write_b32 v0, v0
1350 ; GFX9-NEXT: ;;#ASMSTART
1351 ; GFX9-NEXT: ; use vcc
1352 ; GFX9-NEXT: ;;#ASMEND
1353 ; GFX9-NEXT: s_endpgm
1355 ; GFX10-LABEL: add_select_vop3:
1357 ; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0
1358 ; GFX10-NEXT: ;;#ASMSTART
1359 ; GFX10-NEXT: ; def vcc
1360 ; GFX10-NEXT: ;;#ASMEND
1361 ; GFX10-NEXT: ds_write_b32 v0, v0
1362 ; GFX10-NEXT: ;;#ASMSTART
1363 ; GFX10-NEXT: ; use vcc
1364 ; GFX10-NEXT: ;;#ASMEND
1365 ; GFX10-NEXT: s_endpgm
1367 ; GFX11-LABEL: add_select_vop3:
1369 ; GFX11-NEXT: v_add_nc_u32_e32 v0, s0, v0
1370 ; GFX11-NEXT: ;;#ASMSTART
1371 ; GFX11-NEXT: ; def vcc
1372 ; GFX11-NEXT: ;;#ASMEND
1373 ; GFX11-NEXT: ds_store_b32 v0, v0
1374 ; GFX11-NEXT: ;;#ASMSTART
1375 ; GFX11-NEXT: ; use vcc
1376 ; GFX11-NEXT: ;;#ASMEND
1377 ; GFX11-NEXT: s_endpgm
1379 ; GFX12-LABEL: add_select_vop3:
1381 ; GFX12-NEXT: v_add_nc_u32_e32 v0, s0, v0
1382 ; GFX12-NEXT: ;;#ASMSTART
1383 ; GFX12-NEXT: ; def vcc
1384 ; GFX12-NEXT: ;;#ASMEND
1385 ; GFX12-NEXT: ds_store_b32 v0, v0
1386 ; GFX12-NEXT: ;;#ASMSTART
1387 ; GFX12-NEXT: ; use vcc
1388 ; GFX12-NEXT: ;;#ASMEND
1389 ; GFX12-NEXT: s_endpgm
1390 %vcc = call i64 asm sideeffect "; def vcc", "={vcc}"()
1391 %sub = add i32 %v, %s ; NOTE(review): named %sub, but this is an add
1392 store i32 %sub, ptr addrspace(3) undef
1393 call void asm sideeffect "; use vcc", "{vcc}"(i64 %vcc)
; Intrinsic called by the v_add_* kernels to obtain the workitem id (x) that
; indexes %in.
1397 declare i32 @llvm.amdgcn.workitem.id.x() #1
; Attribute group #0 is attached to kernel definitions in this file; #1 is
; attached to the intrinsic declaration above.
1399 attributes #0 = { nounwind }
1400 attributes #1 = { nounwind readnone speculatable }