1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
3 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s
4 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
; Plain fmul of two <2 x half> values (%a * %b).
; Expected code: GFX9 and GFX10 use a single v_pk_mul_f16. GFX8 multiplies the
; low halves with v_mul_f16_e32 and the high halves (WORD_1 selects) with
; v_mul_f16_sdwa, then repacks both results into v0 with a shift by 16 and an
; SDWA or.
6 define <2 x half> @v_fmul_v2f16(<2 x half> %a, <2 x half> %b) {
7 ; GFX9-LABEL: v_fmul_v2f16:
9 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10 ; GFX9-NEXT: v_pk_mul_f16 v0, v0, v1
11 ; GFX9-NEXT: s_setpc_b64 s[30:31]
13 ; GFX8-LABEL: v_fmul_v2f16:
15 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16 ; GFX8-NEXT: v_mul_f16_e32 v2, v0, v1
17 ; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
18 ; GFX8-NEXT: v_mov_b32_e32 v1, 16
19 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
20 ; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
21 ; GFX8-NEXT: s_setpc_b64 s[30:31]
23 ; GFX10-LABEL: v_fmul_v2f16:
25 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
27 ; GFX10-NEXT: v_pk_mul_f16 v0, v0, v1
28 ; GFX10-NEXT: s_setpc_b64 s[30:31]
29 %mul = fmul <2 x half> %a, %b
; <2 x half> fmul whose left operand is negated with fneg first.
; Expected code: GFX9 and GFX10 fold the negation into v_pk_mul_f16 as
; neg_lo:[1,0] neg_hi:[1,0] (first source only). GFX8 applies an explicit
; v_xor_b32 with 0x80008000 (bit 15 of each 16-bit half) to v0 and then uses
; the same per-half multiply and repack sequence as the un-negated case.
33 define <2 x half> @v_fmul_v2f16_fneg_lhs(<2 x half> %a, <2 x half> %b) {
34 ; GFX9-LABEL: v_fmul_v2f16_fneg_lhs:
36 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37 ; GFX9-NEXT: v_pk_mul_f16 v0, v0, v1 neg_lo:[1,0] neg_hi:[1,0]
38 ; GFX9-NEXT: s_setpc_b64 s[30:31]
40 ; GFX8-LABEL: v_fmul_v2f16_fneg_lhs:
42 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
43 ; GFX8-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
44 ; GFX8-NEXT: v_mul_f16_e32 v2, v0, v1
45 ; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
46 ; GFX8-NEXT: v_mov_b32_e32 v1, 16
47 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
48 ; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
49 ; GFX8-NEXT: s_setpc_b64 s[30:31]
51 ; GFX10-LABEL: v_fmul_v2f16_fneg_lhs:
53 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
54 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
55 ; GFX10-NEXT: v_pk_mul_f16 v0, v0, v1 neg_lo:[1,0] neg_hi:[1,0]
56 ; GFX10-NEXT: s_setpc_b64 s[30:31]
57 %neg.a = fneg <2 x half> %a
58 %mul = fmul <2 x half> %neg.a, %b
; <2 x half> fmul whose right operand is negated with fneg first.
; Expected code: GFX9 and GFX10 fold the negation into v_pk_mul_f16 as
; neg_lo:[0,1] neg_hi:[0,1] (second source only). GFX8 applies an explicit
; v_xor_b32 with 0x80008000 (bit 15 of each 16-bit half) to v1 and then uses
; the per-half multiply and repack sequence.
62 define <2 x half> @v_fmul_v2f16_fneg_rhs(<2 x half> %a, <2 x half> %b) {
63 ; GFX9-LABEL: v_fmul_v2f16_fneg_rhs:
65 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
66 ; GFX9-NEXT: v_pk_mul_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
67 ; GFX9-NEXT: s_setpc_b64 s[30:31]
69 ; GFX8-LABEL: v_fmul_v2f16_fneg_rhs:
71 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
72 ; GFX8-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
73 ; GFX8-NEXT: v_mul_f16_e32 v2, v0, v1
74 ; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
75 ; GFX8-NEXT: v_mov_b32_e32 v1, 16
76 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
77 ; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
78 ; GFX8-NEXT: s_setpc_b64 s[30:31]
80 ; GFX10-LABEL: v_fmul_v2f16_fneg_rhs:
82 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
83 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
84 ; GFX10-NEXT: v_pk_mul_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
85 ; GFX10-NEXT: s_setpc_b64 s[30:31]
86 %neg.b = fneg <2 x half> %b
87 %mul = fmul <2 x half> %a, %neg.b
; <2 x half> fmul where both operands are negated with fneg first.
; Expected code: no neg_lo/neg_hi modifiers and no sign-bit xor appear for any
; target, i.e. the two negations are expected to cancel. GFX9 and GFX10 use a
; bare v_pk_mul_f16; GFX8 uses the per-half multiply and repack sequence.
91 define <2 x half> @v_fmul_v2f16_fneg_lhs_fneg_rhs(<2 x half> %a, <2 x half> %b) {
92 ; GFX9-LABEL: v_fmul_v2f16_fneg_lhs_fneg_rhs:
94 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
95 ; GFX9-NEXT: v_pk_mul_f16 v0, v0, v1
96 ; GFX9-NEXT: s_setpc_b64 s[30:31]
98 ; GFX8-LABEL: v_fmul_v2f16_fneg_lhs_fneg_rhs:
100 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
101 ; GFX8-NEXT: v_mul_f16_e32 v2, v0, v1
102 ; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
103 ; GFX8-NEXT: v_mov_b32_e32 v1, 16
104 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
105 ; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
106 ; GFX8-NEXT: s_setpc_b64 s[30:31]
108 ; GFX10-LABEL: v_fmul_v2f16_fneg_lhs_fneg_rhs:
110 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
111 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
112 ; GFX10-NEXT: v_pk_mul_f16 v0, v0, v1
113 ; GFX10-NEXT: s_setpc_b64 s[30:31]
114 %neg.a = fneg <2 x half> %a
115 %neg.b = fneg <2 x half> %b
116 %mul = fmul <2 x half> %neg.a, %neg.b
; NOTE(review): the four <3 x half> variants below are commented out, so none
; of the RUN lines exercises them. The reason they are disabled is not stated
; alongside them - confirm before re-enabling and regenerating the checks.
121 ; define <3 x half> @v_fmul_v3f16(<3 x half> %a, <3 x half> %b) {
122 ; %mul = fmul <3 x half> %a, %b
123 ; ret <3 x half> %mul
126 ; define <3 x half> @v_fmul_v3f16_fneg_lhs(<3 x half> %a, <3 x half> %b) {
127 ; %neg.a = fneg <3 x half> %a
128 ; %mul = fmul <3 x half> %neg.a, %b
129 ; ret <3 x half> %mul
132 ; define <3 x half> @v_fmul_v3f16_fneg_rhs(<3 x half> %a, <3 x half> %b) {
133 ; %neg.b = fneg <3 x half> %b
134 ; %mul = fmul <3 x half> %a, %neg.b
135 ; ret <3 x half> %mul
138 ; define <3 x half> @v_fmul_v3f16_fneg_lhs_fneg_rhs(<3 x half> %a, <3 x half> %b) {
139 ; %neg.a = fneg <3 x half> %a
140 ; %neg.b = fneg <3 x half> %b
141 ; %mul = fmul <3 x half> %neg.a, %neg.b
142 ; ret <3 x half> %mul
; Plain fmul of two <4 x half> values (%a * %b).
; Expected code: GFX9 and GFX10 use two v_pk_mul_f16 (v0*v2 and v1*v3). GFX8
; does a v_mul_f16_e32 (low half) plus a v_mul_f16_sdwa (WORD_1, high half) for
; each register pair, then repacks v0 and v1 with shifts by 16 (shift amount
; held in v3) and SDWA ors.
145 define <4 x half> @v_fmul_v4f16(<4 x half> %a, <4 x half> %b) {
146 ; GFX9-LABEL: v_fmul_v4f16:
148 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
149 ; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2
150 ; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3
151 ; GFX9-NEXT: s_setpc_b64 s[30:31]
153 ; GFX8-LABEL: v_fmul_v4f16:
155 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
156 ; GFX8-NEXT: v_mul_f16_e32 v4, v0, v2
157 ; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
158 ; GFX8-NEXT: v_mul_f16_e32 v2, v1, v3
159 ; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
160 ; GFX8-NEXT: v_mov_b32_e32 v3, 16
161 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
162 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
163 ; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
164 ; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
165 ; GFX8-NEXT: s_setpc_b64 s[30:31]
167 ; GFX10-LABEL: v_fmul_v4f16:
169 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
170 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
171 ; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2
172 ; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3
173 ; GFX10-NEXT: s_setpc_b64 s[30:31]
174 %mul = fmul <4 x half> %a, %b
; <4 x half> fmul whose left operand is negated with fneg first.
; Expected code: GFX9 and GFX10 fold the negation into both v_pk_mul_f16 as
; neg_lo:[1,0] neg_hi:[1,0]. GFX8 loads 0x80008000 into s4 once, xors it into
; v0 and v1 (the lhs registers), and then runs the per-half multiply and
; repack sequence.
178 define <4 x half> @v_fmul_v4f16_fneg_lhs(<4 x half> %a, <4 x half> %b) {
179 ; GFX9-LABEL: v_fmul_v4f16_fneg_lhs:
181 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
182 ; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[1,0] neg_hi:[1,0]
183 ; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[1,0] neg_hi:[1,0]
184 ; GFX9-NEXT: s_setpc_b64 s[30:31]
186 ; GFX8-LABEL: v_fmul_v4f16_fneg_lhs:
188 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
189 ; GFX8-NEXT: s_mov_b32 s4, 0x80008000
190 ; GFX8-NEXT: v_xor_b32_e32 v0, s4, v0
191 ; GFX8-NEXT: v_xor_b32_e32 v1, s4, v1
192 ; GFX8-NEXT: v_mul_f16_e32 v4, v0, v2
193 ; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
194 ; GFX8-NEXT: v_mul_f16_e32 v2, v1, v3
195 ; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
196 ; GFX8-NEXT: v_mov_b32_e32 v3, 16
197 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
198 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
199 ; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
200 ; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
201 ; GFX8-NEXT: s_setpc_b64 s[30:31]
203 ; GFX10-LABEL: v_fmul_v4f16_fneg_lhs:
205 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
206 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
207 ; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[1,0] neg_hi:[1,0]
208 ; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[1,0] neg_hi:[1,0]
209 ; GFX10-NEXT: s_setpc_b64 s[30:31]
210 %neg.a = fneg <4 x half> %a
211 %mul = fmul <4 x half> %neg.a, %b
; <4 x half> fmul whose right operand is negated with fneg first.
; Expected code: GFX9 and GFX10 fold the negation into both v_pk_mul_f16 as
; neg_lo:[0,1] neg_hi:[0,1]. GFX8 loads 0x80008000 into s4 once, xors it into
; v2 and v3 (the rhs registers), and then runs the per-half multiply and
; repack sequence.
215 define <4 x half> @v_fmul_v4f16_fneg_rhs(<4 x half> %a, <4 x half> %b) {
216 ; GFX9-LABEL: v_fmul_v4f16_fneg_rhs:
218 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
219 ; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
220 ; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
221 ; GFX9-NEXT: s_setpc_b64 s[30:31]
223 ; GFX8-LABEL: v_fmul_v4f16_fneg_rhs:
225 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
226 ; GFX8-NEXT: s_mov_b32 s4, 0x80008000
227 ; GFX8-NEXT: v_xor_b32_e32 v2, s4, v2
228 ; GFX8-NEXT: v_xor_b32_e32 v3, s4, v3
229 ; GFX8-NEXT: v_mul_f16_e32 v4, v0, v2
230 ; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
231 ; GFX8-NEXT: v_mul_f16_e32 v2, v1, v3
232 ; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
233 ; GFX8-NEXT: v_mov_b32_e32 v3, 16
234 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
235 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
236 ; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
237 ; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
238 ; GFX8-NEXT: s_setpc_b64 s[30:31]
240 ; GFX10-LABEL: v_fmul_v4f16_fneg_rhs:
242 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
243 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
244 ; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
245 ; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
246 ; GFX10-NEXT: s_setpc_b64 s[30:31]
247 %neg.b = fneg <4 x half> %b
248 %mul = fmul <4 x half> %a, %neg.b
; <4 x half> fmul where both operands are negated with fneg first.
; Expected code: no neg_lo/neg_hi modifiers and no sign-bit xor appear for any
; target, i.e. the two negations are expected to cancel. GFX9 and GFX10 use two
; bare v_pk_mul_f16; GFX8 uses the per-half multiply and repack sequence.
252 define <4 x half> @v_fmul_v4f16_fneg_lhs_fneg_rhs(<4 x half> %a, <4 x half> %b) {
253 ; GFX9-LABEL: v_fmul_v4f16_fneg_lhs_fneg_rhs:
255 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
256 ; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2
257 ; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3
258 ; GFX9-NEXT: s_setpc_b64 s[30:31]
260 ; GFX8-LABEL: v_fmul_v4f16_fneg_lhs_fneg_rhs:
262 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
263 ; GFX8-NEXT: v_mul_f16_e32 v4, v0, v2
264 ; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
265 ; GFX8-NEXT: v_mul_f16_e32 v2, v1, v3
266 ; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
267 ; GFX8-NEXT: v_mov_b32_e32 v3, 16
268 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
269 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
270 ; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
271 ; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
272 ; GFX8-NEXT: s_setpc_b64 s[30:31]
274 ; GFX10-LABEL: v_fmul_v4f16_fneg_lhs_fneg_rhs:
276 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
277 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
278 ; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2
279 ; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3
280 ; GFX10-NEXT: s_setpc_b64 s[30:31]
281 %neg.a = fneg <4 x half> %a
282 %neg.b = fneg <4 x half> %b
283 %mul = fmul <4 x half> %neg.a, %neg.b
; Plain fmul of two <6 x half> values (%a * %b).
; Expected code: GFX9 and GFX10 use three v_pk_mul_f16 (v0*v3, v1*v4, v2*v5).
; GFX8 does a v_mul_f16_e32 plus a v_mul_f16_sdwa (WORD_1) for each register
; pair and repacks v0..v2 with shifts by 16 and SDWA ors.
; NOTE(review): the GFX8 checks load the shift amount 16 twice (into v5 and
; again into v3); this looks like a redundant materialization in the generated
; code. The assertions are autogenerated, so regenerate them rather than
; hand-editing if codegen changes.
287 define <6 x half> @v_fmul_v6f16(<6 x half> %a, <6 x half> %b) {
288 ; GFX9-LABEL: v_fmul_v6f16:
290 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
291 ; GFX9-NEXT: v_pk_mul_f16 v0, v0, v3
292 ; GFX9-NEXT: v_pk_mul_f16 v1, v1, v4
293 ; GFX9-NEXT: v_pk_mul_f16 v2, v2, v5
294 ; GFX9-NEXT: s_setpc_b64 s[30:31]
296 ; GFX8-LABEL: v_fmul_v6f16:
298 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
299 ; GFX8-NEXT: v_mul_f16_e32 v6, v0, v3
300 ; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
301 ; GFX8-NEXT: v_mul_f16_e32 v3, v1, v4
302 ; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
303 ; GFX8-NEXT: v_mul_f16_e32 v4, v2, v5
304 ; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
305 ; GFX8-NEXT: v_mov_b32_e32 v5, 16
306 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
307 ; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
308 ; GFX8-NEXT: v_mov_b32_e32 v3, 16
309 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
310 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
311 ; GFX8-NEXT: v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
312 ; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
313 ; GFX8-NEXT: s_setpc_b64 s[30:31]
315 ; GFX10-LABEL: v_fmul_v6f16:
317 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
318 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
319 ; GFX10-NEXT: v_pk_mul_f16 v0, v0, v3
320 ; GFX10-NEXT: v_pk_mul_f16 v1, v1, v4
321 ; GFX10-NEXT: v_pk_mul_f16 v2, v2, v5
322 ; GFX10-NEXT: s_setpc_b64 s[30:31]
323 %mul = fmul <6 x half> %a, %b
; <6 x half> fmul whose left operand is negated with fneg first.
; Expected code: GFX9 and GFX10 fold the negation into all three v_pk_mul_f16
; as neg_lo:[1,0] neg_hi:[1,0]. GFX8 loads 0x80008000 into s4 once, xors it
; into v0..v2 (the lhs registers), and then runs the per-half multiply and
; repack sequence.
; NOTE(review): the GFX8 checks load the shift amount 16 twice (into v5 and
; again into v3); this looks like a redundant materialization in the generated
; code.
327 define <6 x half> @v_fmul_v6f16_fneg_lhs(<6 x half> %a, <6 x half> %b) {
328 ; GFX9-LABEL: v_fmul_v6f16_fneg_lhs:
330 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
331 ; GFX9-NEXT: v_pk_mul_f16 v0, v0, v3 neg_lo:[1,0] neg_hi:[1,0]
332 ; GFX9-NEXT: v_pk_mul_f16 v1, v1, v4 neg_lo:[1,0] neg_hi:[1,0]
333 ; GFX9-NEXT: v_pk_mul_f16 v2, v2, v5 neg_lo:[1,0] neg_hi:[1,0]
334 ; GFX9-NEXT: s_setpc_b64 s[30:31]
336 ; GFX8-LABEL: v_fmul_v6f16_fneg_lhs:
338 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
339 ; GFX8-NEXT: s_mov_b32 s4, 0x80008000
340 ; GFX8-NEXT: v_xor_b32_e32 v0, s4, v0
341 ; GFX8-NEXT: v_xor_b32_e32 v1, s4, v1
342 ; GFX8-NEXT: v_xor_b32_e32 v2, s4, v2
343 ; GFX8-NEXT: v_mul_f16_e32 v6, v0, v3
344 ; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
345 ; GFX8-NEXT: v_mul_f16_e32 v3, v1, v4
346 ; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
347 ; GFX8-NEXT: v_mul_f16_e32 v4, v2, v5
348 ; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
349 ; GFX8-NEXT: v_mov_b32_e32 v5, 16
350 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
351 ; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
352 ; GFX8-NEXT: v_mov_b32_e32 v3, 16
353 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
354 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
355 ; GFX8-NEXT: v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
356 ; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
357 ; GFX8-NEXT: s_setpc_b64 s[30:31]
359 ; GFX10-LABEL: v_fmul_v6f16_fneg_lhs:
361 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
362 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
363 ; GFX10-NEXT: v_pk_mul_f16 v0, v0, v3 neg_lo:[1,0] neg_hi:[1,0]
364 ; GFX10-NEXT: v_pk_mul_f16 v1, v1, v4 neg_lo:[1,0] neg_hi:[1,0]
365 ; GFX10-NEXT: v_pk_mul_f16 v2, v2, v5 neg_lo:[1,0] neg_hi:[1,0]
366 ; GFX10-NEXT: s_setpc_b64 s[30:31]
367 %neg.a = fneg <6 x half> %a
368 %mul = fmul <6 x half> %neg.a, %b
; <6 x half> fmul whose right operand is negated with fneg first.
; Expected code: GFX9 and GFX10 fold the negation into all three v_pk_mul_f16
; as neg_lo:[0,1] neg_hi:[0,1]. GFX8 loads 0x80008000 into s4 once, xors it
; into v3..v5 (the rhs registers), and then runs the per-half multiply and
; repack sequence.
; NOTE(review): the GFX8 checks load the shift amount 16 twice (into v5 and
; again into v3); this looks like a redundant materialization in the generated
; code.
372 define <6 x half> @v_fmul_v6f16_fneg_rhs(<6 x half> %a, <6 x half> %b) {
373 ; GFX9-LABEL: v_fmul_v6f16_fneg_rhs:
375 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
376 ; GFX9-NEXT: v_pk_mul_f16 v0, v0, v3 neg_lo:[0,1] neg_hi:[0,1]
377 ; GFX9-NEXT: v_pk_mul_f16 v1, v1, v4 neg_lo:[0,1] neg_hi:[0,1]
378 ; GFX9-NEXT: v_pk_mul_f16 v2, v2, v5 neg_lo:[0,1] neg_hi:[0,1]
379 ; GFX9-NEXT: s_setpc_b64 s[30:31]
381 ; GFX8-LABEL: v_fmul_v6f16_fneg_rhs:
383 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
384 ; GFX8-NEXT: s_mov_b32 s4, 0x80008000
385 ; GFX8-NEXT: v_xor_b32_e32 v3, s4, v3
386 ; GFX8-NEXT: v_xor_b32_e32 v4, s4, v4
387 ; GFX8-NEXT: v_xor_b32_e32 v5, s4, v5
388 ; GFX8-NEXT: v_mul_f16_e32 v6, v0, v3
389 ; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
390 ; GFX8-NEXT: v_mul_f16_e32 v3, v1, v4
391 ; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
392 ; GFX8-NEXT: v_mul_f16_e32 v4, v2, v5
393 ; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
394 ; GFX8-NEXT: v_mov_b32_e32 v5, 16
395 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
396 ; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
397 ; GFX8-NEXT: v_mov_b32_e32 v3, 16
398 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
399 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
400 ; GFX8-NEXT: v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
401 ; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
402 ; GFX8-NEXT: s_setpc_b64 s[30:31]
404 ; GFX10-LABEL: v_fmul_v6f16_fneg_rhs:
406 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
407 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
408 ; GFX10-NEXT: v_pk_mul_f16 v0, v0, v3 neg_lo:[0,1] neg_hi:[0,1]
409 ; GFX10-NEXT: v_pk_mul_f16 v1, v1, v4 neg_lo:[0,1] neg_hi:[0,1]
410 ; GFX10-NEXT: v_pk_mul_f16 v2, v2, v5 neg_lo:[0,1] neg_hi:[0,1]
411 ; GFX10-NEXT: s_setpc_b64 s[30:31]
412 %neg.b = fneg <6 x half> %b
413 %mul = fmul <6 x half> %a, %neg.b
; <6 x half> fmul where both operands are negated with fneg first.
; Expected code: no neg_lo/neg_hi modifiers and no sign-bit xor appear for any
; target, i.e. the two negations are expected to cancel. GFX9 and GFX10 use
; three bare v_pk_mul_f16; GFX8 uses the per-half multiply and repack sequence.
; NOTE(review): the GFX8 checks load the shift amount 16 twice (into v5 and
; again into v3); this looks like a redundant materialization in the generated
; code.
417 define <6 x half> @v_fmul_v6f16_fneg_lhs_fneg_rhs(<6 x half> %a, <6 x half> %b) {
418 ; GFX9-LABEL: v_fmul_v6f16_fneg_lhs_fneg_rhs:
420 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
421 ; GFX9-NEXT: v_pk_mul_f16 v0, v0, v3
422 ; GFX9-NEXT: v_pk_mul_f16 v1, v1, v4
423 ; GFX9-NEXT: v_pk_mul_f16 v2, v2, v5
424 ; GFX9-NEXT: s_setpc_b64 s[30:31]
426 ; GFX8-LABEL: v_fmul_v6f16_fneg_lhs_fneg_rhs:
428 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
429 ; GFX8-NEXT: v_mul_f16_e32 v6, v0, v3
430 ; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
431 ; GFX8-NEXT: v_mul_f16_e32 v3, v1, v4
432 ; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
433 ; GFX8-NEXT: v_mul_f16_e32 v4, v2, v5
434 ; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
435 ; GFX8-NEXT: v_mov_b32_e32 v5, 16
436 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
437 ; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
438 ; GFX8-NEXT: v_mov_b32_e32 v3, 16
439 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
440 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
441 ; GFX8-NEXT: v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
442 ; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
443 ; GFX8-NEXT: s_setpc_b64 s[30:31]
445 ; GFX10-LABEL: v_fmul_v6f16_fneg_lhs_fneg_rhs:
447 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
448 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
449 ; GFX10-NEXT: v_pk_mul_f16 v0, v0, v3
450 ; GFX10-NEXT: v_pk_mul_f16 v1, v1, v4
451 ; GFX10-NEXT: v_pk_mul_f16 v2, v2, v5
452 ; GFX10-NEXT: s_setpc_b64 s[30:31]
453 %neg.a = fneg <6 x half> %a
454 %neg.b = fneg <6 x half> %b
455 %mul = fmul <6 x half> %neg.a, %neg.b
; Plain fmul of two <8 x half> values (%a * %b).
; Expected code: GFX9 and GFX10 use four v_pk_mul_f16 (v0*v4 .. v3*v7). GFX8
; does a v_mul_f16_e32 plus a v_mul_f16_sdwa (WORD_1) for each register pair
; and repacks v0..v3 with shifts by 16 and SDWA ors.
; NOTE(review): the GFX8 checks contain "v_mov_b32_e32 v7, 16" twice, before
; and after the first shift; the second looks redundant in the generated code.
; The assertions are autogenerated, so regenerate them rather than
; hand-editing if codegen changes.
459 define <8 x half> @v_fmul_v8f16(<8 x half> %a, <8 x half> %b) {
460 ; GFX9-LABEL: v_fmul_v8f16:
462 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
463 ; GFX9-NEXT: v_pk_mul_f16 v0, v0, v4
464 ; GFX9-NEXT: v_pk_mul_f16 v1, v1, v5
465 ; GFX9-NEXT: v_pk_mul_f16 v2, v2, v6
466 ; GFX9-NEXT: v_pk_mul_f16 v3, v3, v7
467 ; GFX9-NEXT: s_setpc_b64 s[30:31]
469 ; GFX8-LABEL: v_fmul_v8f16:
471 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
472 ; GFX8-NEXT: v_mul_f16_e32 v8, v0, v4
473 ; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
474 ; GFX8-NEXT: v_mul_f16_e32 v4, v1, v5
475 ; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
476 ; GFX8-NEXT: v_mul_f16_e32 v5, v2, v6
477 ; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
478 ; GFX8-NEXT: v_mul_f16_e32 v6, v3, v7
479 ; GFX8-NEXT: v_mul_f16_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
480 ; GFX8-NEXT: v_mov_b32_e32 v7, 16
481 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
482 ; GFX8-NEXT: v_mov_b32_e32 v7, 16
483 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
484 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
485 ; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
486 ; GFX8-NEXT: v_or_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
487 ; GFX8-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
488 ; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
489 ; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
490 ; GFX8-NEXT: s_setpc_b64 s[30:31]
492 ; GFX10-LABEL: v_fmul_v8f16:
494 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
495 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
496 ; GFX10-NEXT: v_pk_mul_f16 v0, v0, v4
497 ; GFX10-NEXT: v_pk_mul_f16 v1, v1, v5
498 ; GFX10-NEXT: v_pk_mul_f16 v2, v2, v6
499 ; GFX10-NEXT: v_pk_mul_f16 v3, v3, v7
500 ; GFX10-NEXT: s_setpc_b64 s[30:31]
501 %mul = fmul <8 x half> %a, %b
; <8 x half> fmul whose left operand is negated with fneg first.
; Expected code: GFX9 and GFX10 fold the negation into all four v_pk_mul_f16
; as neg_lo:[1,0] neg_hi:[1,0]. GFX8 loads 0x80008000 into s4 once, xors it
; into v0..v3 (the lhs registers), and then runs the per-half multiply and
; repack sequence.
; NOTE(review): the GFX8 checks contain "v_mov_b32_e32 v7, 16" twice, before
; and after the first shift; the second looks redundant in the generated code.
505 define <8 x half> @v_fmul_v8f16_fneg_lhs(<8 x half> %a, <8 x half> %b) {
506 ; GFX9-LABEL: v_fmul_v8f16_fneg_lhs:
508 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
509 ; GFX9-NEXT: v_pk_mul_f16 v0, v0, v4 neg_lo:[1,0] neg_hi:[1,0]
510 ; GFX9-NEXT: v_pk_mul_f16 v1, v1, v5 neg_lo:[1,0] neg_hi:[1,0]
511 ; GFX9-NEXT: v_pk_mul_f16 v2, v2, v6 neg_lo:[1,0] neg_hi:[1,0]
512 ; GFX9-NEXT: v_pk_mul_f16 v3, v3, v7 neg_lo:[1,0] neg_hi:[1,0]
513 ; GFX9-NEXT: s_setpc_b64 s[30:31]
515 ; GFX8-LABEL: v_fmul_v8f16_fneg_lhs:
517 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
518 ; GFX8-NEXT: s_mov_b32 s4, 0x80008000
519 ; GFX8-NEXT: v_xor_b32_e32 v0, s4, v0
520 ; GFX8-NEXT: v_xor_b32_e32 v1, s4, v1
521 ; GFX8-NEXT: v_xor_b32_e32 v2, s4, v2
522 ; GFX8-NEXT: v_xor_b32_e32 v3, s4, v3
523 ; GFX8-NEXT: v_mul_f16_e32 v8, v0, v4
524 ; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
525 ; GFX8-NEXT: v_mul_f16_e32 v4, v1, v5
526 ; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
527 ; GFX8-NEXT: v_mul_f16_e32 v5, v2, v6
528 ; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
529 ; GFX8-NEXT: v_mul_f16_e32 v6, v3, v7
530 ; GFX8-NEXT: v_mul_f16_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
531 ; GFX8-NEXT: v_mov_b32_e32 v7, 16
532 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
533 ; GFX8-NEXT: v_mov_b32_e32 v7, 16
534 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
535 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
536 ; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
537 ; GFX8-NEXT: v_or_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
538 ; GFX8-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
539 ; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
540 ; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
541 ; GFX8-NEXT: s_setpc_b64 s[30:31]
543 ; GFX10-LABEL: v_fmul_v8f16_fneg_lhs:
545 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
546 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
547 ; GFX10-NEXT: v_pk_mul_f16 v0, v0, v4 neg_lo:[1,0] neg_hi:[1,0]
548 ; GFX10-NEXT: v_pk_mul_f16 v1, v1, v5 neg_lo:[1,0] neg_hi:[1,0]
549 ; GFX10-NEXT: v_pk_mul_f16 v2, v2, v6 neg_lo:[1,0] neg_hi:[1,0]
550 ; GFX10-NEXT: v_pk_mul_f16 v3, v3, v7 neg_lo:[1,0] neg_hi:[1,0]
551 ; GFX10-NEXT: s_setpc_b64 s[30:31]
552 %neg.a = fneg <8 x half> %a
553 %mul = fmul <8 x half> %neg.a, %b
; <8 x half> fmul whose right operand is negated with fneg first.
; Expected code: GFX9 and GFX10 fold the negation into all four v_pk_mul_f16
; as neg_lo:[0,1] neg_hi:[0,1]. GFX8 loads 0x80008000 into s4 once, xors it
; into v4..v7 (the rhs registers), and then runs the per-half multiply and
; repack sequence.
; NOTE(review): the GFX8 checks contain "v_mov_b32_e32 v7, 16" twice, before
; and after the first shift; the second looks redundant in the generated code.
557 define <8 x half> @v_fmul_v8f16_fneg_rhs(<8 x half> %a, <8 x half> %b) {
558 ; GFX9-LABEL: v_fmul_v8f16_fneg_rhs:
560 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
561 ; GFX9-NEXT: v_pk_mul_f16 v0, v0, v4 neg_lo:[0,1] neg_hi:[0,1]
562 ; GFX9-NEXT: v_pk_mul_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1]
563 ; GFX9-NEXT: v_pk_mul_f16 v2, v2, v6 neg_lo:[0,1] neg_hi:[0,1]
564 ; GFX9-NEXT: v_pk_mul_f16 v3, v3, v7 neg_lo:[0,1] neg_hi:[0,1]
565 ; GFX9-NEXT: s_setpc_b64 s[30:31]
567 ; GFX8-LABEL: v_fmul_v8f16_fneg_rhs:
569 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
570 ; GFX8-NEXT: s_mov_b32 s4, 0x80008000
571 ; GFX8-NEXT: v_xor_b32_e32 v4, s4, v4
572 ; GFX8-NEXT: v_xor_b32_e32 v5, s4, v5
573 ; GFX8-NEXT: v_xor_b32_e32 v6, s4, v6
574 ; GFX8-NEXT: v_xor_b32_e32 v7, s4, v7
575 ; GFX8-NEXT: v_mul_f16_e32 v8, v0, v4
576 ; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
577 ; GFX8-NEXT: v_mul_f16_e32 v4, v1, v5
578 ; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
579 ; GFX8-NEXT: v_mul_f16_e32 v5, v2, v6
580 ; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
581 ; GFX8-NEXT: v_mul_f16_e32 v6, v3, v7
582 ; GFX8-NEXT: v_mul_f16_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
583 ; GFX8-NEXT: v_mov_b32_e32 v7, 16
584 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
585 ; GFX8-NEXT: v_mov_b32_e32 v7, 16
586 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
587 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
588 ; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
589 ; GFX8-NEXT: v_or_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
590 ; GFX8-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
591 ; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
592 ; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
593 ; GFX8-NEXT: s_setpc_b64 s[30:31]
595 ; GFX10-LABEL: v_fmul_v8f16_fneg_rhs:
597 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
598 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
599 ; GFX10-NEXT: v_pk_mul_f16 v0, v0, v4 neg_lo:[0,1] neg_hi:[0,1]
600 ; GFX10-NEXT: v_pk_mul_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1]
601 ; GFX10-NEXT: v_pk_mul_f16 v2, v2, v6 neg_lo:[0,1] neg_hi:[0,1]
602 ; GFX10-NEXT: v_pk_mul_f16 v3, v3, v7 neg_lo:[0,1] neg_hi:[0,1]
603 ; GFX10-NEXT: s_setpc_b64 s[30:31]
604 %neg.b = fneg <8 x half> %b
605 %mul = fmul <8 x half> %a, %neg.b
; <8 x half> fmul where both operands are negated with fneg first.
; Expected code: no neg_lo/neg_hi modifiers and no sign-bit xor appear for any
; target, i.e. the two negations are expected to cancel. GFX9 and GFX10 use
; four bare v_pk_mul_f16; GFX8 uses the per-half multiply and repack sequence.
; NOTE(review): the GFX8 checks contain "v_mov_b32_e32 v7, 16" twice, before
; and after the first shift; the second looks redundant in the generated code.
609 define <8 x half> @v_fmul_v8f16_fneg_lhs_fneg_rhs(<8 x half> %a, <8 x half> %b) {
610 ; GFX9-LABEL: v_fmul_v8f16_fneg_lhs_fneg_rhs:
612 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
613 ; GFX9-NEXT: v_pk_mul_f16 v0, v0, v4
614 ; GFX9-NEXT: v_pk_mul_f16 v1, v1, v5
615 ; GFX9-NEXT: v_pk_mul_f16 v2, v2, v6
616 ; GFX9-NEXT: v_pk_mul_f16 v3, v3, v7
617 ; GFX9-NEXT: s_setpc_b64 s[30:31]
619 ; GFX8-LABEL: v_fmul_v8f16_fneg_lhs_fneg_rhs:
621 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
622 ; GFX8-NEXT: v_mul_f16_e32 v8, v0, v4
623 ; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
624 ; GFX8-NEXT: v_mul_f16_e32 v4, v1, v5
625 ; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
626 ; GFX8-NEXT: v_mul_f16_e32 v5, v2, v6
627 ; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
628 ; GFX8-NEXT: v_mul_f16_e32 v6, v3, v7
629 ; GFX8-NEXT: v_mul_f16_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
630 ; GFX8-NEXT: v_mov_b32_e32 v7, 16
631 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
632 ; GFX8-NEXT: v_mov_b32_e32 v7, 16
633 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
634 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
635 ; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
636 ; GFX8-NEXT: v_or_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
637 ; GFX8-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
638 ; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
639 ; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
640 ; GFX8-NEXT: s_setpc_b64 s[30:31]
642 ; GFX10-LABEL: v_fmul_v8f16_fneg_lhs_fneg_rhs:
644 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
645 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
646 ; GFX10-NEXT: v_pk_mul_f16 v0, v0, v4
647 ; GFX10-NEXT: v_pk_mul_f16 v1, v1, v5
648 ; GFX10-NEXT: v_pk_mul_f16 v2, v2, v6
649 ; GFX10-NEXT: v_pk_mul_f16 v3, v3, v7
650 ; GFX10-NEXT: s_setpc_b64 s[30:31]
651 %neg.a = fneg <8 x half> %a
652 %neg.b = fneg <8 x half> %b
653 %mul = fmul <8 x half> %neg.a, %neg.b