1 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 %s
; Broadcast a single loaded half into both lanes of the fma addend: the
; scalar is inserted into lane 0 and splatted with an all-zero shuffle mask.
; The checks expect the 16-bit load result to be used directly as the third
; source of one v_pk_fma_f16, with op_sel_hi:[1,1,0] and no other modifiers.
3 ; GCN-LABEL: {{^}}fma_vector_vector_scalar_lo:
4 ; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
5 ; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
6 ; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
13 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0]{{$}}
14 define amdgpu_kernel void @fma_vector_vector_scalar_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
16 %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
18 %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
19 %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
20 %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
; Splat %scalar0: insert into lane 0, then shuffle with mask <0, 0>.
22 %scalar0.vec = insertelement <2 x half> undef, half %scalar0, i32 0
23 %scalar0.broadcast = shufflevector <2 x half> %scalar0.vec, <2 x half> undef, <2 x i32> zeroinitializer
25 %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %scalar0.broadcast)
26 store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
30 ; Apply fneg to the broadcasted vector.
; The negation (fsub from -0.0) is applied after the splat, so both lanes of
; the addend are negated. The checks expect neg_lo:[0,0,1] neg_hi:[0,0,1] on
; the third source in addition to op_sel_hi:[1,1,0].
31 ; GCN-LABEL: {{^}}fma_vector_vector_neg_broadcast_scalar_lo:
32 ; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
33 ; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
34 ; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
41 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
42 define amdgpu_kernel void @fma_vector_vector_neg_broadcast_scalar_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
44 %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
46 %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
47 %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
48 %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
50 %scalar0.vec = insertelement <2 x half> undef, half %scalar0, i32 0
51 %scalar0.broadcast = shufflevector <2 x half> %scalar0.vec, <2 x half> undef, <2 x i32> zeroinitializer
; Vector fneg of the already-splatted value.
52 %neg.scalar0.broadcast = fsub <2 x half> <half -0.0, half -0.0>, %scalar0.broadcast
54 %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.scalar0.broadcast)
55 store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
59 ; Apply fneg before the broadcast.
; The scalar is negated first and the negated value is then splatted into
; both lanes. The checks expect the negation to show up as
; neg_lo:[0,0,1] neg_hi:[0,0,1] on the third source, with op_sel_hi:[1,1,0].
60 ; GCN-LABEL: {{^}}fma_vector_vector_neg_scalar_lo:
61 ; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
62 ; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
63 ; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
70 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
71 define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
73 %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
75 %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
76 %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
77 %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
; Scalar fneg, then splat of the negated scalar.
79 %neg.scalar0 = fsub half -0.0, %scalar0
80 %neg.scalar0.vec = insertelement <2 x half> undef, half %neg.scalar0, i32 0
81 %neg.scalar0.broadcast = shufflevector <2 x half> %neg.scalar0.vec, <2 x half> undef, <2 x i32> zeroinitializer
83 %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.scalar0.broadcast)
84 store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
88 ; Apply fneg both before and after the broadcast; the two negations should cancel out.
; The checks expect op_sel_hi:[1,1,0] on the fma and no neg_lo/neg_hi
; modifiers at all.
89 ; GCN-LABEL: {{^}}fma_vector_vector_neg_broadcast_neg_scalar_lo:
90 ; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
91 ; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
92 ; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
99 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0]{{$}}
100 define amdgpu_kernel void @fma_vector_vector_neg_broadcast_neg_scalar_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
102 %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
104 %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
105 %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
106 %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
; First negation: on the scalar, before the splat.
108 %neg.scalar0 = fsub half -0.0, %scalar0
109 %neg.scalar0.vec = insertelement <2 x half> undef, half %neg.scalar0, i32 0
110 %neg.scalar0.broadcast = shufflevector <2 x half> %neg.scalar0.vec, <2 x half> undef, <2 x i32> zeroinitializer
; Second negation: on the whole splatted vector.
111 %neg.neg.scalar0.broadcast = fsub <2 x half> <half -0.0, half -0.0>, %neg.scalar0.broadcast
113 %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.neg.scalar0.broadcast)
114 store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
118 ; Add scalar, but negate low component
; Lane 0 of the addend is the negated scalar and lane 1 is the original
; scalar. The checks expect only neg_lo:[0,0,1] on the third source, on top
; of op_sel_hi:[1,1,0].
119 ; GCN-LABEL: {{^}}fma_vector_vector_scalar_neg_lo:
120 ; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
121 ; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
122 ; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
129 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0] neg_lo:[0,0,1]{{$}}
130 define amdgpu_kernel void @fma_vector_vector_scalar_neg_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
132 %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
134 %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
135 %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
136 %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
138 %neg.scalar0 = fsub half -0.0, %scalar0
; Build <-scalar0, scalar0>.
139 %neg.scalar0.vec = insertelement <2 x half> undef, half %neg.scalar0, i32 0
140 %neg.scalar0.scalar0 = insertelement <2 x half> %neg.scalar0.vec, half %scalar0, i32 1
141 %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.scalar0.scalar0)
142 store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
146 ; Add scalar, but negate high component
; Lane 0 of the addend is the original scalar and lane 1 is its negation
; (fsub from -0.0). The checks expect only neg_hi:[0,0,1] on the third
; source, on top of op_sel_hi:[1,1,0].
147 ; GCN-LABEL: {{^}}fma_vector_vector_scalar_neg_hi:
148 ; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
149 ; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
150 ; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
157 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0] neg_hi:[0,0,1]{{$}}
158 define amdgpu_kernel void @fma_vector_vector_scalar_neg_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
160 %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
162 %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
163 %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
164 %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
166 %neg.scalar0 = fsub half -0.0, %scalar0
; Build <scalar0, -scalar0>. Lane 0 holds the un-negated scalar, so the
; partial vector is named after it.
167 %scalar0.vec = insertelement <2 x half> undef, half %scalar0, i32 0
168 %scalar0.neg.scalar0 = insertelement <2 x half> %scalar0.vec, half %neg.scalar0, i32 1
169 %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %scalar0.neg.scalar0)
170 store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
174 ; Apply fneg before broadcast with bitcast
; The half is negated, bitcast to i16, splatted, and used as the second
; operand of an integer <2 x i16> add. The checks expect v_pk_add_u16 with
; op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1].
; NOTE(review): the expected output carries floating-point neg modifiers on
; an integer packed add. Confirm against the ISA documentation that
; neg_lo/neg_hi are honoured for v_pk_add_u16; if they are not, this check
; pins an incorrect fold of the fneg.
175 ; GCN-LABEL: {{^}}add_vector_neg_bitcast_scalar_lo:
176 ; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
177 ; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
184 ; GCN: v_pk_add_u16 v{{[0-9]+}}, [[VEC0]], [[SCALAR0]] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]{{$}}
185 define amdgpu_kernel void @add_vector_neg_bitcast_scalar_lo(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
187 %vec0 = load volatile <2 x i16>, <2 x i16> addrspace(3)* %lds, align 4
188 %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
; fneg on the half value, then reinterpret the bits as i16.
189 %neg.scalar0 = fsub half -0.0, %scalar0
190 %neg.scalar0.bc = bitcast half %neg.scalar0 to i16
192 %neg.scalar0.vec = insertelement <2 x i16> undef, i16 %neg.scalar0.bc, i32 0
193 %neg.scalar0.broadcast = shufflevector <2 x i16> %neg.scalar0.vec, <2 x i16> undef, <2 x i32> zeroinitializer
195 %result = add <2 x i16> %vec0, %neg.scalar0.broadcast
196 store <2 x i16> %result, <2 x i16> addrspace(1)* %out, align 4
; The addend is assembled from two separately loaded halves: lane 0 is
; %scalar0 and lane 1 is the negation of %scalar1. The checks expect the
; vector to be built explicitly (v_and_b32 with 0xffff, v_xor_b32 with
; 0x8000, v_lshl_or_b32) and then passed to v_pk_fma_f16 with no modifiers.
200 ; GCN-LABEL: {{^}}fma_vector_vector_scalar_lo_neg_scalar_hi:
201 ; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
202 ; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
203 ; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
204 ; GCN: ds_read_u16 [[SCALAR1:v[0-9]+]]
207 ; GCN: v_and_b32_e32 [[SCALAR0]], 0xffff, [[SCALAR0]]
208 ; GCN: v_xor_b32_e32 [[SCALAR1]], 0x8000, [[SCALAR1]]
209 ; GCN: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[SCALAR1]], 16, [[SCALAR0]]
211 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[PACKED]]{{$}}
212 define amdgpu_kernel void @fma_vector_vector_scalar_lo_neg_scalar_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
214 %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
; Second scalar lives two half elements past %arg2.
215 %arg2.gep = getelementptr inbounds half, half addrspace(3)* %arg2, i32 2
217 %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
218 %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
220 %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
221 %scalar1 = load volatile half, half addrspace(3)* %arg2.gep, align 2
; Build <scalar0, -scalar1>.
223 %neg.scalar1 = fsub half -0.0, %scalar1
224 %vec.ins0 = insertelement <2 x half> undef, half %scalar0, i32 0
225 %vec2 = insertelement <2 x half> %vec.ins0, half %neg.scalar1, i32 1
226 %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %vec2)
227 store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
231 ; FIXME: Can we avoid waitcnt between the two halves?
; The addend is built from two separately loaded halves and then the whole
; vector is negated, so both lanes are negated (not just the low one, despite
; the function name). The checks expect the second half to be loaded with
; ds_read_u16_d16_hi into the same register as the first, and the fma to
; carry neg_lo:[0,0,1] neg_hi:[0,0,1].
232 ; GCN-LABEL: {{^}}fma_vector_vector_neg_scalar_lo_scalar_hi:
233 ; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
234 ; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
235 ; GCN: ds_read_u16 [[PACKED:v[0-9]+]]
237 ; GCN: ds_read_u16_d16_hi [[PACKED]]
239 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[PACKED]] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
240 define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo_scalar_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
242 %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
243 %arg2.gep = getelementptr inbounds half, half addrspace(3)* %arg2, i32 2
245 %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
246 %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
248 %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
249 %scalar1 = load volatile half, half addrspace(3)* %arg2.gep, align 2
; Build <scalar0, scalar1>, then negate the full vector.
251 %vec.ins0 = insertelement <2 x half> undef, half %scalar0, i32 0
252 %vec2 = insertelement <2 x half> %vec.ins0, half %scalar1, i32 1
253 %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
255 %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.vec2)
256 store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
; Negate a loaded vector, then splat its element 1 into both lanes of the
; addend. The checks expect the loaded register to be used directly with
; op_sel:[0,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1] on the third source.
260 ; GCN-LABEL: {{^}}fma_vector_vector_neg_vector_hi:
261 ; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
262 ; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
263 ; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
270 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
271 define amdgpu_kernel void @fma_vector_vector_neg_vector_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
273 %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
274 %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
276 %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
277 %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
278 %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
; Vector fneg followed by a splat of element 1 (mask <1, 1>).
280 %vec2.fneg = fsub <2 x half> <half -0.0, half -0.0>, %vec2
281 %vec2.fneg.elt1.broadcast = shufflevector <2 x half> %vec2.fneg, <2 x half> undef, <2 x i32> <i32 1, i32 1>
283 %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %vec2.fneg.elt1.broadcast)
284 store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
; Negate only element 1 of a loaded vector (extract, scalar fneg, re-insert
; into the same vector). The checks expect the loaded register to be used
; directly with only neg_hi:[0,0,1] on the third source.
288 ; GCN-LABEL: {{^}}fma_vector_vector_vector_neg_hi:
289 ; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
290 ; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
291 ; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
298 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] neg_hi:[0,0,1]{{$}}
299 define amdgpu_kernel void @fma_vector_vector_vector_neg_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
301 %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
302 %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
304 %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
305 %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
306 %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
; Replace element 1 of %vec2 with its negation; element 0 is untouched.
308 %vec2.elt1 = extractelement <2 x half> %vec2, i32 1
309 %neg.vec2.elt1 = fsub half -0.0, %vec2.elt1
311 %neg.vec2.elt1.insert = insertelement <2 x half> %vec2, half %neg.vec2.elt1, i32 1
312 %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.vec2.elt1.insert)
313 store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
; Integer variant: splat element 1 of the second loaded vector into both
; lanes and add it to the first vector. The checks expect v_pk_add_u16 on
; the two loaded registers with only op_sel:[0,1].
317 ; GCN-LABEL: {{^}}add_vector_scalar_hi:
318 ; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
319 ; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
326 ; GCN: v_pk_add_u16 v{{[0-9]+}}, [[VEC0]], [[VEC1]] op_sel:[0,1]{{$}}
327 define amdgpu_kernel void @add_vector_scalar_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(3)* %lds) #0 {
329 %lds.gep1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(3)* %lds, i32 1
331 %vec0 = load volatile <2 x i16>, <2 x i16> addrspace(3)* %lds, align 4
332 %vec1 = load volatile <2 x i16>, <2 x i16> addrspace(3)* %lds.gep1, align 4
; Splat element 1 of %vec1 (mask <1, 1>).
334 %vec1.elt1.broadcast = shufflevector <2 x i16> %vec1, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
335 %result = add <2 x i16> %vec0, %vec1.elt1.broadcast
337 store <2 x i16> %result, <2 x i16> addrspace(1)* %out, align 4
; Splat element 1 of the third loaded vector into both lanes of the fma
; addend. The checks expect the loaded register to be used directly with
; only op_sel:[0,0,1].
341 ; GCN-LABEL: {{^}}fma_vector_vector_scalar_hi:
342 ; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
343 ; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
344 ; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
351 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1]{{$}}
352 define amdgpu_kernel void @fma_vector_vector_scalar_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
354 %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
355 %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
357 %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
358 %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
359 %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
; Splat element 1 of %vec2 (mask <1, 1>).
361 %vec2.elt1.broadcast = shufflevector <2 x half> %vec2, <2 x half> undef, <2 x i32> <i32 1, i32 1>
363 %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %vec2.elt1.broadcast)
365 store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
; Element 1 is negated twice (once by the vector fneg, once by a scalar fneg
; on the extracted element) and the result is inserted back into the
; original, un-negated %vec2. The checks expect the loaded register to be
; used as the third source with no modifiers at all.
; NOTE(review): the function name suggests a negated low lane, but the
; insertelement target is %vec2 rather than %neg.vec2, so lane 0 is never
; negated. Confirm this is the intended input.
369 ; GCN-LABEL: {{^}}fma_vector_vector_neg_vector_lo_neg_hi:
370 ; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
371 ; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
372 ; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
379 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]]{{$}}
380 define amdgpu_kernel void @fma_vector_vector_neg_vector_lo_neg_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
382 %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
383 %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
385 %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
386 %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
387 %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
389 %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
390 %neg.vec2.elt1 = extractelement <2 x half> %neg.vec2, i32 1
391 %neg.neg.vec2.elt1 = fsub half -0.0, %neg.vec2.elt1
; Inserted into %vec2, not %neg.vec2.
392 %neg.neg.vec2.elt1.insert = insertelement <2 x half> %vec2, half %neg.neg.vec2.elt1, i32 1
394 %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.neg.vec2.elt1.insert)
395 store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
; Swap the two lanes of the third loaded vector (shuffle mask <1, 0>) before
; using it as the fma addend. The checks expect the loaded register to be
; used directly with op_sel:[0,0,1] op_sel_hi:[1,1,0].
399 ; GCN-LABEL: {{^}}fma_vector_vector_swap_vector:
400 ; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
401 ; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
402 ; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
409 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] op_sel_hi:[1,1,0]{{$}}
410 define amdgpu_kernel void @fma_vector_vector_swap_vector(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
412 %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
413 %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
415 %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
416 %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
417 %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
; Lane swap: result is <vec2[1], vec2[0]>.
419 %vec2.swap = shufflevector <2 x half> %vec2, <2 x half> undef, <2 x i32> <i32 1, i32 0>
420 %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %vec2.swap)
422 store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
; Negate the third loaded vector and then swap its lanes (mask <1, 0>). The
; checks expect the loaded register to be used directly with
; op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1].
426 ; GCN-LABEL: {{^}}fma_vector_vector_swap_neg_vector:
427 ; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
428 ; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
429 ; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
437 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
438 define amdgpu_kernel void @fma_vector_vector_swap_neg_vector(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
440 %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
441 %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
443 %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
444 %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
445 %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
; Vector fneg, then lane swap: result is <-vec2[1], -vec2[0]>.
446 %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
448 %neg.vec2.swap = shufflevector <2 x half> %neg.vec2, <2 x half> undef, <2 x i32> <i32 1, i32 0>
449 %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.vec2.swap)
451 store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
; Blend a vector with its own negation using shuffle mask <3, 0>: lane 0 of
; the addend is element 1 of the negated vector and lane 1 is element 0 of
; the original. The checks expect the loaded register to be used directly
; with op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1].
455 ; GCN-LABEL: {{^}}fma_vector_vector_blend_vector_neg_vector_0:
456 ; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
457 ; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
458 ; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
466 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1]{{$}}
467 define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
469 %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
470 %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
472 %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
473 %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
474 %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
475 %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
; Mask indices 0-1 select from %vec2, 2-3 from %neg.vec2: <-vec2[1], vec2[0]>.
476 %combined = shufflevector <2 x half> %vec2, <2 x half> %neg.vec2, <2 x i32> <i32 3, i32 0>
477 %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %combined)
479 store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
; Blend a vector with its own negation using shuffle mask <2, 1>: lane 0 of
; the addend is element 0 of the negated vector and lane 1 is element 1 of
; the original. The checks expect the loaded register to be used directly
; with only neg_lo:[0,0,1].
483 ; GCN-LABEL: {{^}}fma_vector_vector_blend_vector_neg_vector_1:
484 ; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
485 ; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
486 ; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
494 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] neg_lo:[0,0,1]{{$}}
495 define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
497 %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
498 %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
500 %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
501 %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
502 %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
503 %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
; Mask indices 0-1 select from %vec2, 2-3 from %neg.vec2: <-vec2[0], vec2[1]>.
504 %combined = shufflevector <2 x half> %vec2, <2 x half> %neg.vec2, <2 x i32> <i32 2, i32 1>
505 %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %combined)
507 store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
; Blend a vector with its own negation using shuffle mask <0, 3>: lane 0 of
; the addend is element 0 of the original vector and lane 1 is element 1 of
; the negated one. The checks expect the loaded register to be used directly
; with only neg_hi:[0,0,1].
511 ; GCN-LABEL: {{^}}fma_vector_vector_blend_vector_neg_vector_2:
512 ; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
513 ; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
514 ; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
522 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] neg_hi:[0,0,1]{{$}}
523 define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_2(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
525 %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
526 %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
528 %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
529 %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
530 %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
531 %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
; Mask indices 0-1 select from %vec2, 2-3 from %neg.vec2: <vec2[0], -vec2[1]>.
532 %combined = shufflevector <2 x half> %vec2, <2 x half> %neg.vec2, <2 x i32> <i32 0, i32 3>
533 %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %combined)
535 store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
; Blend a vector with its own negation using shuffle mask <3, 1>: lane 0 of
; the addend is element 1 of the negated vector and lane 1 is element 1 of
; the original. The checks expect the loaded register to be used directly
; with op_sel:[0,0,1] neg_lo:[0,0,1].
539 ; GCN-LABEL: {{^}}fma_vector_vector_blend_vector_neg_vector_3:
540 ; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
541 ; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
542 ; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
550 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] neg_lo:[0,0,1]{{$}}
551 define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_3(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
553 %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
554 %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
556 %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
557 %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
558 %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
559 %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
; Mask indices 0-1 select from %vec2, 2-3 from %neg.vec2: <-vec2[1], vec2[1]>.
560 %combined = shufflevector <2 x half> %vec2, <2 x half> %neg.vec2, <2 x i32> <i32 3, i32 1>
561 %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %combined)
563 store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
; A 32-bit float is negated and then bitcast to <2 x half> before being used
; in a packed add. The checks expect v_pk_add_f16 with no source modifiers,
; i.e. the f32 negation is not expected to appear as neg_lo/neg_hi on the
; packed operand.
567 ; GCN-LABEL: {{^}}bitcast_fneg_f32:
568 ; GCN: v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+$}}
569 define amdgpu_kernel void @bitcast_fneg_f32(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
571 %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
572 %f32 = load volatile float, float addrspace(3)* undef, align 4
; fneg on the f32 value, then reinterpret the 32 bits as two halves.
573 %neg.f32 = fsub float -0.0, %f32
574 %bc = bitcast float %neg.f32 to <2 x half>
575 %result = fadd <2 x half> %vec0, %bc
577 store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
; Same negated f32 bitcast to <2 x half>, but with the two lanes swapped
; before the packed add. The checks expect v_pk_add_f16 with only
; op_sel:[0,1] op_sel_hi:[1,0], and no neg modifiers.
581 ; GCN-LABEL: {{^}}shuffle_bitcast_fneg_f32:
582 ; GCN: v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} op_sel:[0,1] op_sel_hi:[1,0]{{$}}
583 define amdgpu_kernel void @shuffle_bitcast_fneg_f32(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
585 %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
587 %f32 = load volatile float, float addrspace(3)* undef, align 4
588 %neg.f32 = fsub float -0.0, %f32
589 %bc = bitcast float %neg.f32 to <2 x half>
; Lane swap of the bitcast value (mask <1, 0>).
590 %shuf = shufflevector <2 x half> %bc, <2 x half> undef, <2 x i32> <i32 1, i32 0>
591 %result = fadd <2 x half> %vec0, %shuf
592 store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
; The two 16-bit operands come from an i64 load: %elt0 is bits [15:0] and
; %elt1 is bits [31:16]. They are inserted in swapped order (%elt1 into
; lane 0, %elt0 into lane 1). The checks expect v_pk_add_u16 with no
; modifiers after its operands.
596 ; GCN-LABEL: {{^}}extract_from_i64:
598 ; GCN: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+$}}
599 define amdgpu_kernel void @extract_from_i64(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(3)* %lds) #0 {
601 %vec0 = load volatile <2 x i16>, <2 x i16> addrspace(3)* %lds, align 4
602 %i64 = load volatile i64, i64 addrspace(1)* undef
; Low 16 bits, and the next 16 bits via a right shift by 16.
604 %elt0 = trunc i64 %i64 to i16
605 %hi = lshr i64 %i64, 16
606 %elt1 = trunc i64 %hi to i16
; Build <elt1, elt0>.
608 %ins0 = insertelement <2 x i16> undef, i16 %elt1, i32 0
609 %ins1 = insertelement <2 x i16> %ins0, i16 %elt0, i32 1
610 %result = add <2 x i16> %vec0, %ins1
611 store <2 x i16> %result, <2 x i16> addrspace(1)* %out, align 4
616 ; Bitcast is final obstacle to identifying same source register
; The fma addend is the result of a packed fadd with its lanes swapped. The
; checks expect v_pk_add_f16 immediately followed by v_pk_fma_f16 using the
; fadd result with op_sel:[0,0,1] op_sel_hi:[1,1,0].
; NOTE(review): in the visible body %shl.bc is computed but never used, and
; the shuffle mask <1, 0> only selects lanes of %fadd. Confirm whether the
; bitcast value was meant to feed the shuffle.
617 ; GCN-LABEL: {{^}}bitcast_lo_elt_op_sel:
618 ; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
619 ; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
620 ; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
627 ; GCN: v_pk_add_f16 [[FADD:v[0-9]+]]
628 ; GCN-NEXT: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[FADD]] op_sel:[0,0,1] op_sel_hi:[1,1,0]{{$}}
629 define amdgpu_kernel void @bitcast_lo_elt_op_sel(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
631 %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
632 %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
634 %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
635 %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
636 %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
; Integer scalar shifted and bitcast to half; the result has no users below.
638 %scalar0 = load volatile i16, i16 addrspace(1)* undef
639 %shl = shl i16 %scalar0, 1
640 %shl.bc = bitcast i16 %shl to half
642 %fadd = fadd <2 x half> %vec2, <half 2.0, half 2.0>
; Mask <1, 0> swaps the lanes of %fadd; %vec2 (indices 2-3) is not selected.
643 %shuffle = shufflevector <2 x half> %fadd, <2 x half> %vec2, <2 x i32> <i32 1, i32 0>
645 %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %shuffle)
646 store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
651 ; Bitcast is final obstacle to identifying same source register
; The fma addend is the result of a packed fadd with its lanes swapped; a
; vector holding a bitcast integer scalar is supplied as the second shuffle
; operand. The checks expect v_pk_add_f16 immediately followed by
; v_pk_fma_f16 using the fadd result with op_sel:[0,0,1] op_sel_hi:[1,1,0].
; NOTE(review): the shuffle mask <1, 0> only selects lanes of %fadd, so
; %insert0 never contributes to the result, and %scalar1 is loaded but has no
; users in the visible body. Confirm this is the intended input.
652 ; GCN-LABEL: {{^}}mix_elt_types_op_sel:
653 ; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
654 ; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
655 ; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
662 ; GCN: v_pk_add_f16 [[FADD:v[0-9]+]]
663 ; GCN-NEXT: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[FADD]] op_sel:[0,0,1] op_sel_hi:[1,1,0]{{$}}
664 define amdgpu_kernel void @mix_elt_types_op_sel(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
666 %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
667 %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
669 %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
670 %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
671 %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
673 %scalar0 = load volatile i16, i16 addrspace(1)* undef
674 %scalar1 = load volatile half, half addrspace(1)* undef
; Integer scalar shifted, bitcast to half, and placed in lane 0 of %insert0.
675 %shl = shl i16 %scalar0, 1
676 %shl.bc = bitcast i16 %shl to half
678 %insert0 = insertelement <2 x half> undef, half %shl.bc, i32 0
680 %fadd = fadd <2 x half> %vec2, <half 2.0, half 2.0>
; Mask <1, 0> swaps the lanes of %fadd; %insert0 (indices 2-3) is not selected.
681 %insert1 = shufflevector <2 x half> %fadd, <2 x half> %insert0, <2 x i32> <i32 1, i32 0>
683 %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %insert1)
684 store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
688 declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #1
690 attributes #0 = { nounwind }
691 attributes #1 = { nounwind readnone }