1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s
3 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
; add_v3i16: loads a <3 x i16> from each of %ptra and %ptrb (addrspace(1),
; align 4), adds them element-wise, and stores the sum to %ptr2.
; Checked output: GFX8 loads every element with flat_load_ushort and adds with
; three v_add_u16; GFX9 loads every element with global_load_ushort, packs
; elements 0 and 1 via v_and_b32/v_lshl_or_b32, and adds with v_pk_add_u16.
5 define void @add_v3i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrspace(1) %ptr2) {
6 ; GFX8-LABEL: add_v3i16:
8 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 2, v0
10 ; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
11 ; GFX8-NEXT: flat_load_ushort v8, v[0:1]
12 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v0
13 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
14 ; GFX8-NEXT: flat_load_ushort v9, v[6:7]
15 ; GFX8-NEXT: flat_load_ushort v10, v[0:1]
16 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v2
17 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
18 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 4, v2
19 ; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc
20 ; GFX8-NEXT: flat_load_ushort v11, v[2:3]
21 ; GFX8-NEXT: flat_load_ushort v12, v[0:1]
22 ; GFX8-NEXT: flat_load_ushort v6, v[6:7]
23 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v4
24 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
25 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v4
26 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
27 ; GFX8-NEXT: s_waitcnt vmcnt(2)
28 ; GFX8-NEXT: v_add_u16_e32 v7, v8, v11
29 ; GFX8-NEXT: s_waitcnt vmcnt(1)
30 ; GFX8-NEXT: v_add_u16_e32 v8, v9, v12
31 ; GFX8-NEXT: s_waitcnt vmcnt(0)
32 ; GFX8-NEXT: v_add_u16_e32 v6, v10, v6
33 ; GFX8-NEXT: flat_store_short v[4:5], v7
34 ; GFX8-NEXT: flat_store_short v[0:1], v8
35 ; GFX8-NEXT: flat_store_short v[2:3], v6
36 ; GFX8-NEXT: s_waitcnt vmcnt(0)
37 ; GFX8-NEXT: s_setpc_b64 s[30:31]
39 ; GFX9-LABEL: add_v3i16:
41 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
42 ; GFX9-NEXT: global_load_ushort v6, v[0:1], off
43 ; GFX9-NEXT: global_load_ushort v7, v[0:1], off offset:4
44 ; GFX9-NEXT: global_load_ushort v8, v[2:3], off
45 ; GFX9-NEXT: global_load_ushort v9, v[2:3], off offset:4
46 ; GFX9-NEXT: global_load_ushort v10, v[0:1], off offset:2
47 ; GFX9-NEXT: global_load_ushort v11, v[2:3], off offset:2
48 ; GFX9-NEXT: s_waitcnt vmcnt(5)
49 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6
50 ; GFX9-NEXT: s_waitcnt vmcnt(3)
51 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v8
52 ; GFX9-NEXT: s_waitcnt vmcnt(2)
53 ; GFX9-NEXT: v_pk_add_u16 v2, v7, v9
54 ; GFX9-NEXT: s_waitcnt vmcnt(1)
55 ; GFX9-NEXT: v_lshl_or_b32 v0, v10, 16, v0
56 ; GFX9-NEXT: s_waitcnt vmcnt(0)
57 ; GFX9-NEXT: v_lshl_or_b32 v1, v11, 16, v1
58 ; GFX9-NEXT: v_pk_add_u16 v0, v0, v1
59 ; GFX9-NEXT: global_store_short v[4:5], v0, off
60 ; GFX9-NEXT: global_store_short_d16_hi v[4:5], v0, off offset:2
61 ; GFX9-NEXT: global_store_short v[4:5], v2, off offset:4
62 ; GFX9-NEXT: s_waitcnt vmcnt(0)
63 ; GFX9-NEXT: s_setpc_b64 s[30:31]
64 %a = load <3 x i16>, ptr addrspace(1) %ptra, align 4
65 %b = load <3 x i16>, ptr addrspace(1) %ptrb, align 4
66 %add = add <3 x i16> %a, %b
67 store <3 x i16> %add, ptr addrspace(1) %ptr2, align 4
; add_v3i16_arg: element-wise add of two <3 x i16> values passed as arguments.
; Checked output: GFX8 adds the first dword's halves with v_add_u16 plus an
; SDWA v_add_u16 (WORD_1) recombined by v_or_b32, and the remaining element
; with one v_add_u16; GFX9 uses two v_pk_add_u16.
71 define <3 x i16> @add_v3i16_arg(<3 x i16> %a, <3 x i16> %b) {
72 ; GFX8-LABEL: add_v3i16_arg:
74 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
75 ; GFX8-NEXT: v_add_u16_e32 v4, v0, v2
76 ; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
77 ; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
78 ; GFX8-NEXT: v_add_u16_e32 v1, v1, v3
79 ; GFX8-NEXT: s_setpc_b64 s[30:31]
81 ; GFX9-LABEL: add_v3i16_arg:
83 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
84 ; GFX9-NEXT: v_pk_add_u16 v0, v0, v2
85 ; GFX9-NEXT: v_pk_add_u16 v1, v1, v3
86 ; GFX9-NEXT: s_setpc_b64 s[30:31]
87 %add = add <3 x i16> %a, %b
; add_v4i16: loads a <4 x i16> from each of %ptra and %ptrb (addrspace(1),
; align 4), adds them element-wise, and stores the sum to %ptr2.
; Checked output: both targets load and store with a single dwordx2 access per
; vector; GFX8 adds with v_add_u16 / SDWA v_add_u16 pairs merged by v_or_b32,
; GFX9 adds with two v_pk_add_u16.
92 define void @add_v4i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrspace(1) %ptr2) {
93 ; GFX8-LABEL: add_v4i16:
95 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
96 ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
97 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
98 ; GFX8-NEXT: s_waitcnt vmcnt(0)
99 ; GFX8-NEXT: v_add_u16_e32 v6, v0, v2
100 ; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
101 ; GFX8-NEXT: v_add_u16_e32 v2, v1, v3
102 ; GFX8-NEXT: v_add_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
103 ; GFX8-NEXT: v_or_b32_e32 v0, v6, v0
104 ; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
105 ; GFX8-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
106 ; GFX8-NEXT: s_waitcnt vmcnt(0)
107 ; GFX8-NEXT: s_setpc_b64 s[30:31]
109 ; GFX9-LABEL: add_v4i16:
111 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
112 ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
113 ; GFX9-NEXT: global_load_dwordx2 v[8:9], v[2:3], off
114 ; GFX9-NEXT: s_waitcnt vmcnt(0)
115 ; GFX9-NEXT: v_pk_add_u16 v0, v6, v8
116 ; GFX9-NEXT: v_pk_add_u16 v1, v7, v9
117 ; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
118 ; GFX9-NEXT: s_waitcnt vmcnt(0)
119 ; GFX9-NEXT: s_setpc_b64 s[30:31]
120 %a = load <4 x i16>, ptr addrspace(1) %ptra, align 4
121 %b = load <4 x i16>, ptr addrspace(1) %ptrb, align 4
122 %add = add <4 x i16> %a, %b
123 store <4 x i16> %add, ptr addrspace(1) %ptr2, align 4
; add_v4i16_arg: element-wise add of two <4 x i16> values passed as arguments.
; Checked output: GFX8 uses a v_add_u16 / SDWA v_add_u16 (WORD_1) pair per
; dword, merged with v_or_b32; GFX9 uses two v_pk_add_u16.
126 define <4 x i16> @add_v4i16_arg(<4 x i16> %a, <4 x i16> %b) {
127 ; GFX8-LABEL: add_v4i16_arg:
129 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
130 ; GFX8-NEXT: v_add_u16_e32 v4, v0, v2
131 ; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
132 ; GFX8-NEXT: v_add_u16_e32 v2, v1, v3
133 ; GFX8-NEXT: v_add_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
134 ; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
135 ; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
136 ; GFX8-NEXT: s_setpc_b64 s[30:31]
138 ; GFX9-LABEL: add_v4i16_arg:
140 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
141 ; GFX9-NEXT: v_pk_add_u16 v0, v0, v2
142 ; GFX9-NEXT: v_pk_add_u16 v1, v1, v3
143 ; GFX9-NEXT: s_setpc_b64 s[30:31]
144 %add = add <4 x i16> %a, %b
; add_v5i16: loads a <5 x i16> from each of %ptra and %ptrb (addrspace(1),
; align 4), adds them element-wise, and stores the sum to %ptr2.
; Checked output: GFX8 computes each element address explicitly and uses five
; flat_load_ushort per operand, five v_add_u16, and five flat_store_short;
; GFX9 uses global_load_ushort with immediate offsets, packs element pairs via
; v_and_b32/v_lshl_or_b32, and adds with three v_pk_add_u16.
148 define void @add_v5i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrspace(1) %ptr2) {
149 ; GFX8-LABEL: add_v5i16:
151 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
152 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 2, v0
153 ; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
154 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 4, v0
155 ; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
156 ; GFX8-NEXT: v_add_u32_e32 v10, vcc, 6, v0
157 ; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc
158 ; GFX8-NEXT: flat_load_ushort v12, v[0:1]
159 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 8, v0
160 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
161 ; GFX8-NEXT: flat_load_ushort v13, v[6:7]
162 ; GFX8-NEXT: flat_load_ushort v14, v[8:9]
163 ; GFX8-NEXT: flat_load_ushort v15, v[10:11]
164 ; GFX8-NEXT: flat_load_ushort v16, v[0:1]
165 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v2
166 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
167 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 4, v2
168 ; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc
169 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 6, v2
170 ; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc
171 ; GFX8-NEXT: v_add_u32_e32 v10, vcc, 8, v2
172 ; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc
173 ; GFX8-NEXT: flat_load_ushort v17, v[2:3]
174 ; GFX8-NEXT: flat_load_ushort v18, v[0:1]
175 ; GFX8-NEXT: flat_load_ushort v19, v[6:7]
176 ; GFX8-NEXT: flat_load_ushort v20, v[8:9]
177 ; GFX8-NEXT: flat_load_ushort v10, v[10:11]
178 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v4
179 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
180 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v4
181 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
182 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 6, v4
183 ; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc
184 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 8, v4
185 ; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc
186 ; GFX8-NEXT: s_waitcnt vmcnt(4)
187 ; GFX8-NEXT: v_add_u16_e32 v11, v12, v17
188 ; GFX8-NEXT: s_waitcnt vmcnt(3)
189 ; GFX8-NEXT: v_add_u16_e32 v12, v13, v18
190 ; GFX8-NEXT: s_waitcnt vmcnt(2)
191 ; GFX8-NEXT: v_add_u16_e32 v13, v14, v19
192 ; GFX8-NEXT: s_waitcnt vmcnt(1)
193 ; GFX8-NEXT: v_add_u16_e32 v14, v15, v20
194 ; GFX8-NEXT: s_waitcnt vmcnt(0)
195 ; GFX8-NEXT: v_add_u16_e32 v10, v16, v10
196 ; GFX8-NEXT: flat_store_short v[4:5], v11
197 ; GFX8-NEXT: flat_store_short v[0:1], v12
198 ; GFX8-NEXT: flat_store_short v[2:3], v13
199 ; GFX8-NEXT: flat_store_short v[6:7], v14
200 ; GFX8-NEXT: flat_store_short v[8:9], v10
201 ; GFX8-NEXT: s_waitcnt vmcnt(0)
202 ; GFX8-NEXT: s_setpc_b64 s[30:31]
204 ; GFX9-LABEL: add_v5i16:
206 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
207 ; GFX9-NEXT: global_load_ushort v6, v[0:1], off
208 ; GFX9-NEXT: global_load_ushort v7, v[0:1], off offset:4
209 ; GFX9-NEXT: global_load_ushort v8, v[0:1], off offset:8
210 ; GFX9-NEXT: global_load_ushort v9, v[2:3], off
211 ; GFX9-NEXT: global_load_ushort v10, v[2:3], off offset:4
212 ; GFX9-NEXT: global_load_ushort v11, v[2:3], off offset:8
213 ; GFX9-NEXT: global_load_ushort v12, v[0:1], off offset:2
214 ; GFX9-NEXT: global_load_ushort v13, v[0:1], off offset:6
215 ; GFX9-NEXT: global_load_ushort v14, v[2:3], off offset:2
216 ; GFX9-NEXT: global_load_ushort v15, v[2:3], off offset:6
217 ; GFX9-NEXT: s_waitcnt vmcnt(9)
218 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6
219 ; GFX9-NEXT: s_waitcnt vmcnt(8)
220 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v7
221 ; GFX9-NEXT: s_waitcnt vmcnt(6)
222 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v9
223 ; GFX9-NEXT: s_waitcnt vmcnt(5)
224 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v10
225 ; GFX9-NEXT: s_waitcnt vmcnt(4)
226 ; GFX9-NEXT: v_pk_add_u16 v6, v8, v11
227 ; GFX9-NEXT: s_waitcnt vmcnt(3)
228 ; GFX9-NEXT: v_lshl_or_b32 v0, v12, 16, v0
229 ; GFX9-NEXT: s_waitcnt vmcnt(2)
230 ; GFX9-NEXT: v_lshl_or_b32 v1, v13, 16, v1
231 ; GFX9-NEXT: s_waitcnt vmcnt(1)
232 ; GFX9-NEXT: v_lshl_or_b32 v2, v14, 16, v2
233 ; GFX9-NEXT: s_waitcnt vmcnt(0)
234 ; GFX9-NEXT: v_lshl_or_b32 v3, v15, 16, v3
235 ; GFX9-NEXT: v_pk_add_u16 v0, v0, v2
236 ; GFX9-NEXT: v_pk_add_u16 v1, v1, v3
237 ; GFX9-NEXT: global_store_short v[4:5], v0, off
238 ; GFX9-NEXT: global_store_short_d16_hi v[4:5], v0, off offset:2
239 ; GFX9-NEXT: global_store_short v[4:5], v1, off offset:4
240 ; GFX9-NEXT: global_store_short_d16_hi v[4:5], v1, off offset:6
241 ; GFX9-NEXT: global_store_short v[4:5], v6, off offset:8
242 ; GFX9-NEXT: s_waitcnt vmcnt(0)
243 ; GFX9-NEXT: s_setpc_b64 s[30:31]
244 %a = load <5 x i16>, ptr addrspace(1) %ptra, align 4
245 %b = load <5 x i16>, ptr addrspace(1) %ptrb, align 4
246 %add = add <5 x i16> %a, %b
247 store <5 x i16> %add, ptr addrspace(1) %ptr2, align 4
; add_v5i16_arg: element-wise add of two <5 x i16> values passed as arguments.
; Checked output: GFX8 uses v_add_u16 / SDWA v_add_u16 (WORD_1) pairs merged
; by v_or_b32 for the first two dwords and one v_add_u16 for the last element;
; GFX9 uses three v_pk_add_u16.
251 define <5 x i16> @add_v5i16_arg(<5 x i16> %a, <5 x i16> %b) {
252 ; GFX8-LABEL: add_v5i16_arg:
254 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
255 ; GFX8-NEXT: v_add_u16_e32 v6, v0, v3
256 ; GFX8-NEXT: v_add_u16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
257 ; GFX8-NEXT: v_add_u16_e32 v3, v1, v4
258 ; GFX8-NEXT: v_add_u16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
259 ; GFX8-NEXT: v_or_b32_e32 v0, v6, v0
260 ; GFX8-NEXT: v_or_b32_e32 v1, v3, v1
261 ; GFX8-NEXT: v_add_u16_e32 v2, v2, v5
262 ; GFX8-NEXT: s_setpc_b64 s[30:31]
264 ; GFX9-LABEL: add_v5i16_arg:
266 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
267 ; GFX9-NEXT: v_pk_add_u16 v0, v0, v3
268 ; GFX9-NEXT: v_pk_add_u16 v1, v1, v4
269 ; GFX9-NEXT: v_pk_add_u16 v2, v2, v5
270 ; GFX9-NEXT: s_setpc_b64 s[30:31]
271 %add = add <5 x i16> %a, %b
; add_v6i16: loads a <6 x i16> from each of %ptra and %ptrb (addrspace(1),
; align 4), adds them element-wise, and stores the sum to %ptr2.
; Checked output: both targets load and store with a single dwordx3 access per
; vector; GFX8 adds with three v_add_u16 / SDWA v_add_u16 pairs merged by
; v_or_b32, GFX9 adds with three v_pk_add_u16.
275 define void @add_v6i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrspace(1) %ptr2) {
276 ; GFX8-LABEL: add_v6i16:
278 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
279 ; GFX8-NEXT: flat_load_dwordx3 v[6:8], v[0:1]
280 ; GFX8-NEXT: flat_load_dwordx3 v[0:2], v[2:3]
281 ; GFX8-NEXT: s_waitcnt vmcnt(0)
282 ; GFX8-NEXT: v_add_u16_e32 v3, v6, v0
283 ; GFX8-NEXT: v_add_u16_sdwa v0, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
284 ; GFX8-NEXT: v_add_u16_e32 v6, v7, v1
285 ; GFX8-NEXT: v_add_u16_sdwa v1, v7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
286 ; GFX8-NEXT: v_add_u16_e32 v7, v8, v2
287 ; GFX8-NEXT: v_add_u16_sdwa v2, v8, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
288 ; GFX8-NEXT: v_or_b32_e32 v0, v3, v0
289 ; GFX8-NEXT: v_or_b32_e32 v1, v6, v1
290 ; GFX8-NEXT: v_or_b32_e32 v2, v7, v2
291 ; GFX8-NEXT: flat_store_dwordx3 v[4:5], v[0:2]
292 ; GFX8-NEXT: s_waitcnt vmcnt(0)
293 ; GFX8-NEXT: s_setpc_b64 s[30:31]
295 ; GFX9-LABEL: add_v6i16:
297 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
298 ; GFX9-NEXT: global_load_dwordx3 v[6:8], v[0:1], off
299 ; GFX9-NEXT: global_load_dwordx3 v[9:11], v[2:3], off
300 ; GFX9-NEXT: s_waitcnt vmcnt(0)
301 ; GFX9-NEXT: v_pk_add_u16 v0, v6, v9
302 ; GFX9-NEXT: v_pk_add_u16 v1, v7, v10
303 ; GFX9-NEXT: v_pk_add_u16 v2, v8, v11
304 ; GFX9-NEXT: global_store_dwordx3 v[4:5], v[0:2], off
305 ; GFX9-NEXT: s_waitcnt vmcnt(0)
306 ; GFX9-NEXT: s_setpc_b64 s[30:31]
307 %a = load <6 x i16>, ptr addrspace(1) %ptra, align 4
308 %b = load <6 x i16>, ptr addrspace(1) %ptrb, align 4
309 %add = add <6 x i16> %a, %b
310 store <6 x i16> %add, ptr addrspace(1) %ptr2, align 4
; add_v6i16_arg: element-wise add of two <6 x i16> values passed as arguments.
; Checked output: GFX8 uses three v_add_u16 / SDWA v_add_u16 (WORD_1) pairs,
; each merged with v_or_b32; GFX9 uses three v_pk_add_u16.
314 define <6 x i16> @add_v6i16_arg(<6 x i16> %a, <6 x i16> %b) {
315 ; GFX8-LABEL: add_v6i16_arg:
317 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
318 ; GFX8-NEXT: v_add_u16_e32 v6, v0, v3
319 ; GFX8-NEXT: v_add_u16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
320 ; GFX8-NEXT: v_add_u16_e32 v3, v1, v4
321 ; GFX8-NEXT: v_add_u16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
322 ; GFX8-NEXT: v_or_b32_e32 v1, v3, v1
323 ; GFX8-NEXT: v_add_u16_e32 v3, v2, v5
324 ; GFX8-NEXT: v_add_u16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
325 ; GFX8-NEXT: v_or_b32_e32 v0, v6, v0
326 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
327 ; GFX8-NEXT: s_setpc_b64 s[30:31]
329 ; GFX9-LABEL: add_v6i16_arg:
331 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
332 ; GFX9-NEXT: v_pk_add_u16 v0, v0, v3
333 ; GFX9-NEXT: v_pk_add_u16 v1, v1, v4
334 ; GFX9-NEXT: v_pk_add_u16 v2, v2, v5
335 ; GFX9-NEXT: s_setpc_b64 s[30:31]
336 %add = add <6 x i16> %a, %b
; addv_7i16: loads a <7 x i16> from each of %ptra and %ptrb (addrspace(1),
; align 4), adds them element-wise, and stores the sum to %ptr2.
; Checked output: GFX8 uses seven flat_load_ushort per operand, seven
; v_add_u16, and seven flat_store_short, with the store addresses computed
; between the adds; GFX9 uses global_load_ushort with immediate offsets, packs
; element pairs via v_and_b32/v_lshl_or_b32, and adds with four v_pk_add_u16.
; NOTE(review): the name is `addv_7i16` while the sibling tests follow the
; `add_vNi16` pattern -- presumably a typo for `add_v7i16`. Renaming would
; also require updating both -LABEL check lines; confirm before changing.
340 define void @addv_7i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrspace(1) %ptr2) {
341 ; GFX8-LABEL: addv_7i16:
343 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
344 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 2, v0
345 ; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
346 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 4, v0
347 ; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
348 ; GFX8-NEXT: v_add_u32_e32 v10, vcc, 6, v0
349 ; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc
350 ; GFX8-NEXT: v_add_u32_e32 v12, vcc, 8, v0
351 ; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc
352 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, 10, v0
353 ; GFX8-NEXT: v_addc_u32_e32 v15, vcc, 0, v1, vcc
354 ; GFX8-NEXT: flat_load_ushort v16, v[0:1]
355 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 12, v0
356 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
357 ; GFX8-NEXT: flat_load_ushort v17, v[6:7]
358 ; GFX8-NEXT: flat_load_ushort v18, v[8:9]
359 ; GFX8-NEXT: flat_load_ushort v19, v[10:11]
360 ; GFX8-NEXT: flat_load_ushort v20, v[12:13]
361 ; GFX8-NEXT: flat_load_ushort v21, v[14:15]
362 ; GFX8-NEXT: flat_load_ushort v22, v[0:1]
363 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v2
364 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
365 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 4, v2
366 ; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc
367 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 6, v2
368 ; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc
369 ; GFX8-NEXT: v_add_u32_e32 v10, vcc, 8, v2
370 ; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc
371 ; GFX8-NEXT: v_add_u32_e32 v12, vcc, 10, v2
372 ; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v3, vcc
373 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, 12, v2
374 ; GFX8-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc
375 ; GFX8-NEXT: flat_load_ushort v2, v[2:3]
376 ; GFX8-NEXT: flat_load_ushort v3, v[0:1]
377 ; GFX8-NEXT: flat_load_ushort v6, v[6:7]
378 ; GFX8-NEXT: flat_load_ushort v7, v[8:9]
379 ; GFX8-NEXT: flat_load_ushort v8, v[10:11]
380 ; GFX8-NEXT: flat_load_ushort v9, v[12:13]
381 ; GFX8-NEXT: flat_load_ushort v10, v[14:15]
382 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v4
383 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
384 ; GFX8-NEXT: s_waitcnt vmcnt(6)
385 ; GFX8-NEXT: v_add_u16_e32 v2, v16, v2
386 ; GFX8-NEXT: s_waitcnt vmcnt(5)
387 ; GFX8-NEXT: v_add_u16_e32 v3, v17, v3
388 ; GFX8-NEXT: flat_store_short v[4:5], v2
389 ; GFX8-NEXT: flat_store_short v[0:1], v3
390 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v4
391 ; GFX8-NEXT: s_waitcnt vmcnt(6)
392 ; GFX8-NEXT: v_add_u16_e32 v6, v18, v6
393 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
394 ; GFX8-NEXT: flat_store_short v[0:1], v6
395 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 6, v4
396 ; GFX8-NEXT: s_waitcnt vmcnt(6)
397 ; GFX8-NEXT: v_add_u16_e32 v7, v19, v7
398 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
399 ; GFX8-NEXT: flat_store_short v[0:1], v7
400 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 8, v4
401 ; GFX8-NEXT: s_waitcnt vmcnt(6)
402 ; GFX8-NEXT: v_add_u16_e32 v8, v20, v8
403 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
404 ; GFX8-NEXT: flat_store_short v[0:1], v8
405 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 10, v4
406 ; GFX8-NEXT: s_waitcnt vmcnt(6)
407 ; GFX8-NEXT: v_add_u16_e32 v9, v21, v9
408 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
409 ; GFX8-NEXT: flat_store_short v[0:1], v9
410 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 12, v4
411 ; GFX8-NEXT: s_waitcnt vmcnt(6)
412 ; GFX8-NEXT: v_add_u16_e32 v10, v22, v10
413 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
414 ; GFX8-NEXT: flat_store_short v[0:1], v10
415 ; GFX8-NEXT: s_waitcnt vmcnt(0)
416 ; GFX8-NEXT: s_setpc_b64 s[30:31]
418 ; GFX9-LABEL: addv_7i16:
420 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
421 ; GFX9-NEXT: global_load_ushort v6, v[0:1], off
422 ; GFX9-NEXT: global_load_ushort v7, v[0:1], off offset:4
423 ; GFX9-NEXT: global_load_ushort v8, v[0:1], off offset:8
424 ; GFX9-NEXT: global_load_ushort v9, v[0:1], off offset:12
425 ; GFX9-NEXT: global_load_ushort v10, v[2:3], off
426 ; GFX9-NEXT: global_load_ushort v11, v[2:3], off offset:4
427 ; GFX9-NEXT: global_load_ushort v12, v[2:3], off offset:8
428 ; GFX9-NEXT: global_load_ushort v13, v[2:3], off offset:12
429 ; GFX9-NEXT: global_load_ushort v14, v[0:1], off offset:2
430 ; GFX9-NEXT: global_load_ushort v15, v[0:1], off offset:6
431 ; GFX9-NEXT: global_load_ushort v16, v[0:1], off offset:10
432 ; GFX9-NEXT: global_load_ushort v17, v[2:3], off offset:2
433 ; GFX9-NEXT: global_load_ushort v18, v[2:3], off offset:6
434 ; GFX9-NEXT: global_load_ushort v19, v[2:3], off offset:10
435 ; GFX9-NEXT: s_waitcnt vmcnt(13)
436 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6
437 ; GFX9-NEXT: s_waitcnt vmcnt(12)
438 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v7
439 ; GFX9-NEXT: s_waitcnt vmcnt(11)
440 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v8
441 ; GFX9-NEXT: s_waitcnt vmcnt(9)
442 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v10
443 ; GFX9-NEXT: s_waitcnt vmcnt(8)
444 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v11
445 ; GFX9-NEXT: s_waitcnt vmcnt(7)
446 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v12
447 ; GFX9-NEXT: s_waitcnt vmcnt(6)
448 ; GFX9-NEXT: v_pk_add_u16 v8, v9, v13
449 ; GFX9-NEXT: s_waitcnt vmcnt(5)
450 ; GFX9-NEXT: v_lshl_or_b32 v0, v14, 16, v0
451 ; GFX9-NEXT: s_waitcnt vmcnt(4)
452 ; GFX9-NEXT: v_lshl_or_b32 v1, v15, 16, v1
453 ; GFX9-NEXT: s_waitcnt vmcnt(3)
454 ; GFX9-NEXT: v_lshl_or_b32 v2, v16, 16, v2
455 ; GFX9-NEXT: s_waitcnt vmcnt(2)
456 ; GFX9-NEXT: v_lshl_or_b32 v3, v17, 16, v3
457 ; GFX9-NEXT: s_waitcnt vmcnt(1)
458 ; GFX9-NEXT: v_lshl_or_b32 v6, v18, 16, v6
459 ; GFX9-NEXT: s_waitcnt vmcnt(0)
460 ; GFX9-NEXT: v_lshl_or_b32 v7, v19, 16, v7
461 ; GFX9-NEXT: v_pk_add_u16 v0, v0, v3
462 ; GFX9-NEXT: v_pk_add_u16 v1, v1, v6
463 ; GFX9-NEXT: v_pk_add_u16 v2, v2, v7
464 ; GFX9-NEXT: global_store_short v[4:5], v0, off
465 ; GFX9-NEXT: global_store_short_d16_hi v[4:5], v0, off offset:2
466 ; GFX9-NEXT: global_store_short v[4:5], v1, off offset:4
467 ; GFX9-NEXT: global_store_short_d16_hi v[4:5], v1, off offset:6
468 ; GFX9-NEXT: global_store_short v[4:5], v2, off offset:8
469 ; GFX9-NEXT: global_store_short_d16_hi v[4:5], v2, off offset:10
470 ; GFX9-NEXT: global_store_short v[4:5], v8, off offset:12
471 ; GFX9-NEXT: s_waitcnt vmcnt(0)
472 ; GFX9-NEXT: s_setpc_b64 s[30:31]
473 %a = load <7 x i16>, ptr addrspace(1) %ptra, align 4
474 %b = load <7 x i16>, ptr addrspace(1) %ptrb, align 4
475 %add = add <7 x i16> %a, %b
476 store <7 x i16> %add, ptr addrspace(1) %ptr2, align 4
; add_v7i16_arg: element-wise add of two <7 x i16> values passed as arguments.
; Checked output: GFX8 uses three v_add_u16 / SDWA v_add_u16 (WORD_1) pairs
; merged by v_or_b32 plus one v_add_u16 for the last element; GFX9 uses four
; v_pk_add_u16.
480 define <7 x i16> @add_v7i16_arg(<7 x i16> %a, <7 x i16> %b) {
481 ; GFX8-LABEL: add_v7i16_arg:
483 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
484 ; GFX8-NEXT: v_add_u16_e32 v8, v0, v4
485 ; GFX8-NEXT: v_add_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
486 ; GFX8-NEXT: v_add_u16_e32 v4, v1, v5
487 ; GFX8-NEXT: v_add_u16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
488 ; GFX8-NEXT: v_or_b32_e32 v1, v4, v1
489 ; GFX8-NEXT: v_add_u16_e32 v4, v2, v6
490 ; GFX8-NEXT: v_add_u16_sdwa v2, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
491 ; GFX8-NEXT: v_or_b32_e32 v0, v8, v0
492 ; GFX8-NEXT: v_or_b32_e32 v2, v4, v2
493 ; GFX8-NEXT: v_add_u16_e32 v3, v3, v7
494 ; GFX8-NEXT: s_setpc_b64 s[30:31]
496 ; GFX9-LABEL: add_v7i16_arg:
498 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
499 ; GFX9-NEXT: v_pk_add_u16 v0, v0, v4
500 ; GFX9-NEXT: v_pk_add_u16 v1, v1, v5
501 ; GFX9-NEXT: v_pk_add_u16 v2, v2, v6
502 ; GFX9-NEXT: v_pk_add_u16 v3, v3, v7
503 ; GFX9-NEXT: s_setpc_b64 s[30:31]
504 %add = add <7 x i16> %a, %b
; add_v9i16: loads a <9 x i16> from each of %ptra and %ptrb (addrspace(1),
; align 4), adds them element-wise, and stores the sum to %ptr2.
; Checked output: both targets split each vector into a dwordx4 access for
; the first eight elements plus a 16-bit access at byte offset 16 for the
; ninth; GFX8 adds with v_add_u16 / SDWA v_add_u16 pairs, GFX9 with
; v_pk_add_u16.
508 define void @add_v9i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrspace(1) %ptr2) {
509 ; GFX8-LABEL: add_v9i16:
511 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
512 ; GFX8-NEXT: flat_load_dwordx4 v[6:9], v[0:1]
513 ; GFX8-NEXT: flat_load_dwordx4 v[10:13], v[2:3]
514 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0
515 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
516 ; GFX8-NEXT: flat_load_ushort v16, v[0:1]
517 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v2
518 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
519 ; GFX8-NEXT: flat_load_ushort v0, v[0:1]
520 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, 16, v4
521 ; GFX8-NEXT: v_addc_u32_e32 v15, vcc, 0, v5, vcc
522 ; GFX8-NEXT: s_waitcnt vmcnt(2)
523 ; GFX8-NEXT: v_add_u16_e32 v1, v6, v10
524 ; GFX8-NEXT: v_add_u16_sdwa v2, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
525 ; GFX8-NEXT: v_add_u16_e32 v3, v7, v11
526 ; GFX8-NEXT: v_add_u16_sdwa v6, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
527 ; GFX8-NEXT: v_add_u16_e32 v7, v8, v12
528 ; GFX8-NEXT: v_add_u16_sdwa v8, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
529 ; GFX8-NEXT: v_add_u16_e32 v10, v9, v13
530 ; GFX8-NEXT: v_add_u16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
531 ; GFX8-NEXT: s_waitcnt vmcnt(0)
532 ; GFX8-NEXT: v_add_u16_e32 v11, v16, v0
533 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v2
534 ; GFX8-NEXT: v_or_b32_e32 v1, v3, v6
535 ; GFX8-NEXT: v_or_b32_e32 v2, v7, v8
536 ; GFX8-NEXT: v_or_b32_e32 v3, v10, v9
537 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
538 ; GFX8-NEXT: flat_store_short v[14:15], v11
539 ; GFX8-NEXT: s_waitcnt vmcnt(0)
540 ; GFX8-NEXT: s_setpc_b64 s[30:31]
542 ; GFX9-LABEL: add_v9i16:
544 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
545 ; GFX9-NEXT: global_load_dwordx4 v[6:9], v[2:3], off
546 ; GFX9-NEXT: global_load_dwordx4 v[10:13], v[0:1], off
547 ; GFX9-NEXT: global_load_ushort v14, v[0:1], off offset:16
548 ; GFX9-NEXT: global_load_ushort v15, v[2:3], off offset:16
549 ; GFX9-NEXT: s_waitcnt vmcnt(2)
550 ; GFX9-NEXT: v_pk_add_u16 v0, v10, v6
551 ; GFX9-NEXT: v_pk_add_u16 v1, v11, v7
552 ; GFX9-NEXT: v_pk_add_u16 v2, v12, v8
553 ; GFX9-NEXT: v_pk_add_u16 v3, v13, v9
554 ; GFX9-NEXT: s_waitcnt vmcnt(0)
555 ; GFX9-NEXT: v_pk_add_u16 v6, v14, v15
556 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
557 ; GFX9-NEXT: global_store_short v[4:5], v6, off offset:16
558 ; GFX9-NEXT: s_waitcnt vmcnt(0)
559 ; GFX9-NEXT: s_setpc_b64 s[30:31]
560 %a = load <9 x i16>, ptr addrspace(1) %ptra, align 4
561 %b = load <9 x i16>, ptr addrspace(1) %ptrb, align 4
562 %add = add <9 x i16> %a, %b
563 store <9 x i16> %add, ptr addrspace(1) %ptr2, align 4
; add_v9i16_arg: element-wise add of two <9 x i16> values passed as arguments.
; Checked output: GFX8 uses four v_add_u16 / SDWA v_add_u16 (WORD_1) pairs
; merged by v_or_b32 plus one v_add_u16 for the last element; GFX9 uses five
; v_pk_add_u16.
567 define <9 x i16> @add_v9i16_arg(<9 x i16> %a, <9 x i16> %b) {
568 ; GFX8-LABEL: add_v9i16_arg:
570 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
571 ; GFX8-NEXT: v_add_u16_e32 v10, v0, v5
572 ; GFX8-NEXT: v_add_u16_sdwa v0, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
573 ; GFX8-NEXT: v_add_u16_e32 v5, v1, v6
574 ; GFX8-NEXT: v_add_u16_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
575 ; GFX8-NEXT: v_or_b32_e32 v1, v5, v1
576 ; GFX8-NEXT: v_add_u16_e32 v5, v2, v7
577 ; GFX8-NEXT: v_add_u16_sdwa v2, v2, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
578 ; GFX8-NEXT: v_or_b32_e32 v2, v5, v2
579 ; GFX8-NEXT: v_add_u16_e32 v5, v3, v8
580 ; GFX8-NEXT: v_add_u16_sdwa v3, v3, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
581 ; GFX8-NEXT: v_or_b32_e32 v0, v10, v0
582 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
583 ; GFX8-NEXT: v_add_u16_e32 v4, v4, v9
584 ; GFX8-NEXT: s_setpc_b64 s[30:31]
586 ; GFX9-LABEL: add_v9i16_arg:
588 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
589 ; GFX9-NEXT: v_pk_add_u16 v0, v0, v5
590 ; GFX9-NEXT: v_pk_add_u16 v1, v1, v6
591 ; GFX9-NEXT: v_pk_add_u16 v2, v2, v7
592 ; GFX9-NEXT: v_pk_add_u16 v3, v3, v8
593 ; GFX9-NEXT: v_pk_add_u16 v4, v4, v9
594 ; GFX9-NEXT: s_setpc_b64 s[30:31]
595 %add = add <9 x i16> %a, %b
; add_v10i16: loads a <10 x i16> from each of %ptra and %ptrb (addrspace(1),
; align 4), adds them element-wise, and stores the sum to %ptr2.
; Checked output: both targets split each vector into a dwordx4 access for
; the first eight elements plus a dword access at byte offset 16 for the last
; two; GFX8 adds with v_add_u16 / SDWA v_add_u16 pairs merged by v_or_b32,
; GFX9 with five v_pk_add_u16.
599 define void @add_v10i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrspace(1) %ptr2) {
600 ; GFX8-LABEL: add_v10i16:
602 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
603 ; GFX8-NEXT: flat_load_dwordx4 v[6:9], v[0:1]
604 ; GFX8-NEXT: flat_load_dwordx4 v[10:13], v[2:3]
605 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0
606 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
607 ; GFX8-NEXT: flat_load_dword v14, v[0:1]
608 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v2
609 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
610 ; GFX8-NEXT: flat_load_dword v15, v[0:1]
611 ; GFX8-NEXT: s_waitcnt vmcnt(2)
612 ; GFX8-NEXT: v_add_u16_e32 v0, v6, v10
613 ; GFX8-NEXT: v_add_u16_sdwa v1, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
614 ; GFX8-NEXT: v_add_u16_e32 v2, v7, v11
615 ; GFX8-NEXT: v_add_u16_sdwa v3, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
616 ; GFX8-NEXT: v_add_u16_e32 v6, v8, v12
617 ; GFX8-NEXT: v_add_u16_sdwa v7, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
618 ; GFX8-NEXT: v_add_u16_e32 v8, v9, v13
619 ; GFX8-NEXT: v_add_u16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
620 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
621 ; GFX8-NEXT: v_or_b32_e32 v1, v2, v3
622 ; GFX8-NEXT: v_or_b32_e32 v2, v6, v7
623 ; GFX8-NEXT: v_or_b32_e32 v3, v8, v9
624 ; GFX8-NEXT: s_waitcnt vmcnt(0)
625 ; GFX8-NEXT: v_add_u16_e32 v6, v14, v15
626 ; GFX8-NEXT: v_add_u16_sdwa v7, v14, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
627 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
628 ; GFX8-NEXT: v_or_b32_e32 v6, v6, v7
629 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v4
630 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
631 ; GFX8-NEXT: flat_store_dword v[0:1], v6
632 ; GFX8-NEXT: s_waitcnt vmcnt(0)
633 ; GFX8-NEXT: s_setpc_b64 s[30:31]
635 ; GFX9-LABEL: add_v10i16:
637 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
638 ; GFX9-NEXT: global_load_dwordx4 v[6:9], v[2:3], off
639 ; GFX9-NEXT: global_load_dwordx4 v[10:13], v[0:1], off
640 ; GFX9-NEXT: global_load_dword v14, v[0:1], off offset:16
641 ; GFX9-NEXT: global_load_dword v15, v[2:3], off offset:16
642 ; GFX9-NEXT: s_waitcnt vmcnt(2)
643 ; GFX9-NEXT: v_pk_add_u16 v0, v10, v6
644 ; GFX9-NEXT: v_pk_add_u16 v1, v11, v7
645 ; GFX9-NEXT: v_pk_add_u16 v2, v12, v8
646 ; GFX9-NEXT: v_pk_add_u16 v3, v13, v9
647 ; GFX9-NEXT: s_waitcnt vmcnt(0)
648 ; GFX9-NEXT: v_pk_add_u16 v6, v14, v15
649 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
650 ; GFX9-NEXT: global_store_dword v[4:5], v6, off offset:16
651 ; GFX9-NEXT: s_waitcnt vmcnt(0)
652 ; GFX9-NEXT: s_setpc_b64 s[30:31]
653 %a = load <10 x i16>, ptr addrspace(1) %ptra, align 4
654 %b = load <10 x i16>, ptr addrspace(1) %ptrb, align 4
655 %add = add <10 x i16> %a, %b
656 store <10 x i16> %add, ptr addrspace(1) %ptr2, align 4
; add_v11i16: loads an <11 x i16> from each of %ptra and %ptrb (addrspace(1),
; align 4), adds them element-wise, and stores the sum to %ptr2.
; Checked output: both targets use a dwordx4 access for the first eight
; elements and 16-bit accesses at byte offsets 16, 18, and 20 for the last
; three; GFX8 adds the tail with three v_add_u16, GFX9 packs the elements at
; offsets 16/18 via v_and_b32/v_lshl_or_b32 and adds with v_pk_add_u16.
660 define void @add_v11i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrspace(1) %ptr2) {
661 ; GFX8-LABEL: add_v11i16:
663 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
664 ; GFX8-NEXT: v_add_u32_e32 v10, vcc, 16, v0
665 ; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc
666 ; GFX8-NEXT: v_add_u32_e32 v12, vcc, 18, v0
667 ; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc
668 ; GFX8-NEXT: flat_load_dwordx4 v[6:9], v[0:1]
669 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 20, v0
670 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
671 ; GFX8-NEXT: flat_load_ushort v18, v[10:11]
672 ; GFX8-NEXT: flat_load_ushort v19, v[12:13]
673 ; GFX8-NEXT: flat_load_ushort v20, v[0:1]
674 ; GFX8-NEXT: flat_load_dwordx4 v[10:13], v[2:3]
675 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v2
676 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
677 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, 18, v2
678 ; GFX8-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc
679 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 20, v2
680 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
681 ; GFX8-NEXT: flat_load_ushort v0, v[0:1]
682 ; GFX8-NEXT: flat_load_ushort v1, v[14:15]
683 ; GFX8-NEXT: flat_load_ushort v2, v[2:3]
684 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, 16, v4
685 ; GFX8-NEXT: v_addc_u32_e32 v15, vcc, 0, v5, vcc
686 ; GFX8-NEXT: v_add_u32_e32 v16, vcc, 18, v4
687 ; GFX8-NEXT: v_addc_u32_e32 v17, vcc, 0, v5, vcc
688 ; GFX8-NEXT: s_waitcnt vmcnt(3)
689 ; GFX8-NEXT: v_add_u16_e32 v3, v6, v10
690 ; GFX8-NEXT: v_add_u16_sdwa v10, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
691 ; GFX8-NEXT: v_add_u16_e32 v21, v7, v11
692 ; GFX8-NEXT: v_add_u16_sdwa v11, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
693 ; GFX8-NEXT: v_add_u16_e32 v22, v8, v12
694 ; GFX8-NEXT: v_add_u16_sdwa v8, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
695 ; GFX8-NEXT: v_add_u16_e32 v12, v9, v13
696 ; GFX8-NEXT: v_add_u16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
697 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 20, v4
698 ; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc
699 ; GFX8-NEXT: s_waitcnt vmcnt(2)
700 ; GFX8-NEXT: v_add_u16_e32 v13, v18, v0
701 ; GFX8-NEXT: s_waitcnt vmcnt(1)
702 ; GFX8-NEXT: v_add_u16_e32 v18, v19, v1
703 ; GFX8-NEXT: s_waitcnt vmcnt(0)
704 ; GFX8-NEXT: v_add_u16_e32 v19, v20, v2
705 ; GFX8-NEXT: v_or_b32_e32 v0, v3, v10
706 ; GFX8-NEXT: v_or_b32_e32 v1, v21, v11
707 ; GFX8-NEXT: v_or_b32_e32 v2, v22, v8
708 ; GFX8-NEXT: v_or_b32_e32 v3, v12, v9
709 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
710 ; GFX8-NEXT: flat_store_short v[14:15], v13
711 ; GFX8-NEXT: flat_store_short v[16:17], v18
712 ; GFX8-NEXT: flat_store_short v[6:7], v19
713 ; GFX8-NEXT: s_waitcnt vmcnt(0)
714 ; GFX8-NEXT: s_setpc_b64 s[30:31]
716 ; GFX9-LABEL: add_v11i16:
718 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
719 ; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off
720 ; GFX9-NEXT: global_load_ushort v14, v[0:1], off offset:16
721 ; GFX9-NEXT: global_load_ushort v15, v[2:3], off offset:16
722 ; GFX9-NEXT: global_load_dwordx4 v[10:13], v[2:3], off
723 ; GFX9-NEXT: global_load_ushort v16, v[0:1], off offset:20
724 ; GFX9-NEXT: global_load_ushort v17, v[2:3], off offset:20
725 ; GFX9-NEXT: global_load_ushort v18, v[0:1], off offset:18
726 ; GFX9-NEXT: global_load_ushort v19, v[2:3], off offset:18
727 ; GFX9-NEXT: s_waitcnt vmcnt(6)
728 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14
729 ; GFX9-NEXT: s_waitcnt vmcnt(5)
730 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15
731 ; GFX9-NEXT: s_waitcnt vmcnt(4)
732 ; GFX9-NEXT: v_pk_add_u16 v0, v6, v10
733 ; GFX9-NEXT: v_pk_add_u16 v1, v7, v11
734 ; GFX9-NEXT: v_pk_add_u16 v2, v8, v12
735 ; GFX9-NEXT: v_pk_add_u16 v3, v9, v13
736 ; GFX9-NEXT: s_waitcnt vmcnt(1)
737 ; GFX9-NEXT: v_lshl_or_b32 v7, v18, 16, v14
738 ; GFX9-NEXT: s_waitcnt vmcnt(0)
739 ; GFX9-NEXT: v_lshl_or_b32 v8, v19, 16, v15
740 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
741 ; GFX9-NEXT: v_pk_add_u16 v6, v16, v17
742 ; GFX9-NEXT: v_pk_add_u16 v0, v7, v8
743 ; GFX9-NEXT: global_store_short v[4:5], v0, off offset:16
744 ; GFX9-NEXT: global_store_short_d16_hi v[4:5], v0, off offset:18
745 ; GFX9-NEXT: global_store_short v[4:5], v6, off offset:20
746 ; GFX9-NEXT: s_waitcnt vmcnt(0)
747 ; GFX9-NEXT: s_setpc_b64 s[30:31]
748 %a = load <11 x i16>, ptr addrspace(1) %ptra, align 4
749 %b = load <11 x i16>, ptr addrspace(1) %ptrb, align 4
750 %add = add <11 x i16> %a, %b
751 store <11 x i16> %add, ptr addrspace(1) %ptr2, align 4
; add_v11i16_arg: <11 x i16> add where both operands are passed as function
; arguments; the declared result type is also <11 x i16>.
;
; Expected code under the fiji run (GFX8 prefix): for each of v0-v4 a
; v_add_u16_e32 produces the low 16 bits, a v_add_u16_sdwa with WORD_1
; selects produces the high 16 bits, and a v_or_b32 merges the two halves.
; v5 only gets a single v_add_u16_e32 (presumably because the eleventh
; element has no upper-half partner -- confirm against the calling convention).
;
; Expected code under the gfx900 run (GFX9 prefix): six v_pk_add_u16
; instructions, v0-v5 added to v6-v11.
;
; The check lines below are autogenerated (see the note at the top of this
; file); regenerate them with that script instead of editing them by hand.
755 define <11 x i16> @add_v11i16_arg(<11 x i16> %a, <11 x i16> %b) {
756 ; GFX8-LABEL: add_v11i16_arg:
758 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
759 ; GFX8-NEXT: v_add_u16_e32 v12, v0, v6
760 ; GFX8-NEXT: v_add_u16_sdwa v0, v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
761 ; GFX8-NEXT: v_add_u16_e32 v6, v1, v7
762 ; GFX8-NEXT: v_add_u16_sdwa v1, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
763 ; GFX8-NEXT: v_or_b32_e32 v1, v6, v1
764 ; GFX8-NEXT: v_add_u16_e32 v6, v2, v8
765 ; GFX8-NEXT: v_add_u16_sdwa v2, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
766 ; GFX8-NEXT: v_or_b32_e32 v2, v6, v2
767 ; GFX8-NEXT: v_add_u16_e32 v6, v3, v9
768 ; GFX8-NEXT: v_add_u16_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
769 ; GFX8-NEXT: v_or_b32_e32 v3, v6, v3
770 ; GFX8-NEXT: v_add_u16_e32 v6, v4, v10
771 ; GFX8-NEXT: v_add_u16_sdwa v4, v4, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
772 ; GFX8-NEXT: v_or_b32_e32 v0, v12, v0
773 ; GFX8-NEXT: v_or_b32_e32 v4, v6, v4
774 ; GFX8-NEXT: v_add_u16_e32 v5, v5, v11
775 ; GFX8-NEXT: s_setpc_b64 s[30:31]
777 ; GFX9-LABEL: add_v11i16_arg:
779 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
780 ; GFX9-NEXT: v_pk_add_u16 v0, v0, v6
781 ; GFX9-NEXT: v_pk_add_u16 v1, v1, v7
782 ; GFX9-NEXT: v_pk_add_u16 v2, v2, v8
783 ; GFX9-NEXT: v_pk_add_u16 v3, v3, v9
784 ; GFX9-NEXT: v_pk_add_u16 v4, v4, v10
785 ; GFX9-NEXT: v_pk_add_u16 v5, v5, v11
786 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; IR under test: element-wise add of the two vector arguments.
787 %add = add <11 x i16> %a, %b
; add_v12i16: loads a <12 x i16> from %ptra and from %ptrb (both align 4,
; addrspace(1)), adds them element-wise, and stores the sum to %ptr2.
;
; Expected code under the fiji run (GFX8 prefix): each 24-byte vector is
; fetched with one flat_load_dwordx4 plus one flat_load_dwordx2 whose address
; is the base pointer + 16, computed with a v_add_u32 / v_addc_u32 pair. Every
; dword is then summed as a low-half v_add_u16_e32 and a high-half
; v_add_u16_sdwa (WORD_1 selects) merged by v_or_b32. The result is written
; with flat_store_dwordx4 followed by flat_store_dwordx2 at %ptr2 + 16.
;
; Expected code under the gfx900 run (GFX9 prefix): the +16 displacement is
; folded into the global_load / global_store "offset:16" field, and the adds
; are six v_pk_add_u16 instructions.
;
; The check lines below are autogenerated (see the note at the top of this
; file); regenerate them with that script instead of editing them by hand.
791 define void @add_v12i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrspace(1) %ptr2) {
792 ; GFX8-LABEL: add_v12i16:
794 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
795 ; GFX8-NEXT: flat_load_dwordx4 v[6:9], v[0:1]
796 ; GFX8-NEXT: flat_load_dwordx4 v[10:13], v[2:3]
797 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0
798 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
799 ; GFX8-NEXT: flat_load_dwordx2 v[14:15], v[0:1]
800 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v2
801 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
802 ; GFX8-NEXT: flat_load_dwordx2 v[16:17], v[0:1]
803 ; GFX8-NEXT: s_waitcnt vmcnt(2)
804 ; GFX8-NEXT: v_add_u16_e32 v0, v6, v10
805 ; GFX8-NEXT: v_add_u16_sdwa v1, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
806 ; GFX8-NEXT: v_add_u16_e32 v2, v7, v11
807 ; GFX8-NEXT: v_add_u16_sdwa v3, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
808 ; GFX8-NEXT: v_add_u16_e32 v6, v8, v12
809 ; GFX8-NEXT: v_add_u16_sdwa v7, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
810 ; GFX8-NEXT: v_add_u16_e32 v8, v9, v13
811 ; GFX8-NEXT: v_add_u16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
812 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
813 ; GFX8-NEXT: v_or_b32_e32 v1, v2, v3
814 ; GFX8-NEXT: v_or_b32_e32 v2, v6, v7
815 ; GFX8-NEXT: v_or_b32_e32 v3, v8, v9
816 ; GFX8-NEXT: s_waitcnt vmcnt(0)
817 ; GFX8-NEXT: v_add_u16_e32 v6, v14, v16
818 ; GFX8-NEXT: v_add_u16_sdwa v7, v14, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
819 ; GFX8-NEXT: v_add_u16_e32 v8, v15, v17
820 ; GFX8-NEXT: v_add_u16_sdwa v9, v15, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
821 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
822 ; GFX8-NEXT: v_or_b32_e32 v6, v6, v7
823 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v4
824 ; GFX8-NEXT: v_or_b32_e32 v7, v8, v9
825 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
826 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[6:7]
827 ; GFX8-NEXT: s_waitcnt vmcnt(0)
828 ; GFX8-NEXT: s_setpc_b64 s[30:31]
830 ; GFX9-LABEL: add_v12i16:
832 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
833 ; GFX9-NEXT: global_load_dwordx4 v[6:9], v[2:3], off
834 ; GFX9-NEXT: global_load_dwordx4 v[10:13], v[0:1], off
835 ; GFX9-NEXT: global_load_dwordx2 v[14:15], v[0:1], off offset:16
836 ; GFX9-NEXT: global_load_dwordx2 v[16:17], v[2:3], off offset:16
837 ; GFX9-NEXT: s_waitcnt vmcnt(2)
838 ; GFX9-NEXT: v_pk_add_u16 v0, v10, v6
839 ; GFX9-NEXT: v_pk_add_u16 v1, v11, v7
840 ; GFX9-NEXT: v_pk_add_u16 v2, v12, v8
841 ; GFX9-NEXT: v_pk_add_u16 v3, v13, v9
842 ; GFX9-NEXT: s_waitcnt vmcnt(0)
843 ; GFX9-NEXT: v_pk_add_u16 v6, v14, v16
844 ; GFX9-NEXT: v_pk_add_u16 v7, v15, v17
845 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
846 ; GFX9-NEXT: global_store_dwordx2 v[4:5], v[6:7], off offset:16
847 ; GFX9-NEXT: s_waitcnt vmcnt(0)
848 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; IR under test: load both operands, add element-wise, store the sum.
849 %a = load <12 x i16>, ptr addrspace(1) %ptra, align 4
850 %b = load <12 x i16>, ptr addrspace(1) %ptrb, align 4
851 %add = add <12 x i16> %a, %b
852 store <12 x i16> %add, ptr addrspace(1) %ptr2, align 4
; add_v12i16_arg: <12 x i16> add where both operands are passed as function
; arguments; the declared result type is also <12 x i16>.
;
; Expected code under the fiji run (GFX8 prefix): for each of v0-v5 a
; v_add_u16_e32 produces the low 16 bits, a v_add_u16_sdwa with WORD_1
; selects produces the high 16 bits, and a v_or_b32 merges the two halves.
; v12 is used as a temporary for the low half of the first dword.
;
; Expected code under the gfx900 run (GFX9 prefix): six v_pk_add_u16
; instructions, v0-v5 added to v6-v11.
;
; The check lines below are autogenerated (see the note at the top of this
; file); regenerate them with that script instead of editing them by hand.
856 define <12 x i16> @add_v12i16_arg(<12 x i16> %a, <12 x i16> %b) {
857 ; GFX8-LABEL: add_v12i16_arg:
859 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
860 ; GFX8-NEXT: v_add_u16_e32 v12, v0, v6
861 ; GFX8-NEXT: v_add_u16_sdwa v0, v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
862 ; GFX8-NEXT: v_add_u16_e32 v6, v1, v7
863 ; GFX8-NEXT: v_add_u16_sdwa v1, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
864 ; GFX8-NEXT: v_or_b32_e32 v1, v6, v1
865 ; GFX8-NEXT: v_add_u16_e32 v6, v2, v8
866 ; GFX8-NEXT: v_add_u16_sdwa v2, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
867 ; GFX8-NEXT: v_or_b32_e32 v2, v6, v2
868 ; GFX8-NEXT: v_add_u16_e32 v6, v3, v9
869 ; GFX8-NEXT: v_add_u16_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
870 ; GFX8-NEXT: v_or_b32_e32 v3, v6, v3
871 ; GFX8-NEXT: v_add_u16_e32 v6, v4, v10
872 ; GFX8-NEXT: v_add_u16_sdwa v4, v4, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
873 ; GFX8-NEXT: v_or_b32_e32 v4, v6, v4
874 ; GFX8-NEXT: v_add_u16_e32 v6, v5, v11
875 ; GFX8-NEXT: v_add_u16_sdwa v5, v5, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
876 ; GFX8-NEXT: v_or_b32_e32 v0, v12, v0
877 ; GFX8-NEXT: v_or_b32_e32 v5, v6, v5
878 ; GFX8-NEXT: s_setpc_b64 s[30:31]
880 ; GFX9-LABEL: add_v12i16_arg:
882 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
883 ; GFX9-NEXT: v_pk_add_u16 v0, v0, v6
884 ; GFX9-NEXT: v_pk_add_u16 v1, v1, v7
885 ; GFX9-NEXT: v_pk_add_u16 v2, v2, v8
886 ; GFX9-NEXT: v_pk_add_u16 v3, v3, v9
887 ; GFX9-NEXT: v_pk_add_u16 v4, v4, v10
888 ; GFX9-NEXT: v_pk_add_u16 v5, v5, v11
889 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; IR under test: element-wise add of the two vector arguments.
890 %add = add <12 x i16> %a, %b