llvm/test/CodeGen/AMDGPU/packed-op-sel.ll

   1 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
   2
   3 ; GCN-LABEL: {{^}}fma_vector_vector_scalar_lo:
   4 ; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
   5 ; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
   6 ; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
   7
   8 ; GCN-NOT: pack
   9 ; GCN-NOT: and
  10 ; GCN-NOT: shl
  11 ; GCN-NOT: or
  12
  13 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0]{{$}}
   14 define amdgpu_kernel void @fma_vector_vector_scalar_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
   15 bb: ; Computes fma(vec0, vec1, splat(scalar0)) and stores the result to %out.
   16   %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
   17   ; Volatile loads: vectors from %lds[0] and %lds[1], one half from %arg2.
   18   %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
   19   %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
   20   %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
   21   ; Splat %scalar0: insert it into lane 0, then shuffle with an all-zero mask.
   22   %scalar0.vec = insertelement <2 x half> undef, half %scalar0, i32 0
   23   %scalar0.broadcast = shufflevector <2 x half> %scalar0.vec, <2 x half> undef, <2 x i32> zeroinitializer
   24   ; The splat is the addend (third operand) of the packed fma.
   25   %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %scalar0.broadcast)
   26   store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
   27   ret void
   28 }
  29
  30 ; Apply fneg to broadcasted vector
  31 ; GCN-LABEL: {{^}}fma_vector_vector_neg_broadcast_scalar_lo:
  32 ; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
  33 ; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
  34 ; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
  35
  36 ; GCN-NOT: pack
  37 ; GCN-NOT: and
  38 ; GCN-NOT: shl
  39 ; GCN-NOT: or
  40
  41 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
   42 define amdgpu_kernel void @fma_vector_vector_neg_broadcast_scalar_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
   43 bb: ; Computes fma(vec0, vec1, -splat(scalar0)); the negation is applied to the splatted vector.
   44   %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
   45   ; Volatile loads: vectors from %lds[0] and %lds[1], one half from %arg2.
   46   %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
   47   %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
   48   %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
   49   ; Splat %scalar0 first, then negate both lanes (fsub from -0.0).
   50   %scalar0.vec = insertelement <2 x half> undef, half %scalar0, i32 0
   51   %scalar0.broadcast = shufflevector <2 x half> %scalar0.vec, <2 x half> undef, <2 x i32> zeroinitializer
   52   %neg.scalar0.broadcast = fsub <2 x half> <half -0.0, half -0.0>, %scalar0.broadcast
   53   ; The negated splat is the addend (third operand) of the packed fma.
   54   %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.scalar0.broadcast)
   55   store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
   56   ret void
   57 }
  58
  59 ; Apply fneg before broadcast
  60 ; GCN-LABEL: {{^}}fma_vector_vector_neg_scalar_lo:
  61 ; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
  62 ; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
  63 ; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
  64
  65 ; GCN-NOT: pack
  66 ; GCN-NOT: and
  67 ; GCN-NOT: shl
  68 ; GCN-NOT: or
  69
  70 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
   71 define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
   72 bb: ; Computes fma(vec0, vec1, splat(-scalar0)); the negation is applied to the scalar before the splat.
   73   %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
   74   ; Volatile loads: vectors from %lds[0] and %lds[1], one half from %arg2.
   75   %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
   76   %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
   77   %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
   78   ; Negate the scalar (fsub from -0.0), then splat it into both lanes.
   79   %neg.scalar0 = fsub half -0.0, %scalar0
   80   %neg.scalar0.vec = insertelement <2 x half> undef, half %neg.scalar0, i32 0
   81   %neg.scalar0.broadcast = shufflevector <2 x half> %neg.scalar0.vec, <2 x half> undef, <2 x i32> zeroinitializer
   82   ; The splat of the negated scalar is the addend (third operand) of the packed fma.
   83   %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.scalar0.broadcast)
   84   store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
   85   ret void
   86 }
  87
   88 ; Apply fneg both before and after the broadcast; the two negations should cancel out.
  89 ; GCN-LABEL: {{^}}fma_vector_vector_neg_broadcast_neg_scalar_lo:
  90 ; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
  91 ; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
  92 ; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
  93
  94 ; GCN-NOT: pack
  95 ; GCN-NOT: and
  96 ; GCN-NOT: shl
  97 ; GCN-NOT: or
  98
  99 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0]{{$}}
  100 define amdgpu_kernel void @fma_vector_vector_neg_broadcast_neg_scalar_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
  101 bb: ; Computes fma(vec0, vec1, -splat(-scalar0)): the scalar is negated, splatted, and the vector negated again.
  102   %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
  103   ; Volatile loads: vectors from %lds[0] and %lds[1], one half from %arg2.
  104   %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  105   %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
  106   %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
  107   ; First negation on the scalar, splat, then a second negation on the vector.
  108   %neg.scalar0 = fsub half -0.0, %scalar0
  109   %neg.scalar0.vec = insertelement <2 x half> undef, half %neg.scalar0, i32 0
  110   %neg.scalar0.broadcast = shufflevector <2 x half> %neg.scalar0.vec, <2 x half> undef, <2 x i32> zeroinitializer
  111   %neg.neg.scalar0.broadcast = fsub <2 x half> <half -0.0, half -0.0>, %neg.scalar0.broadcast
  112   ; The doubly negated splat is the addend (third operand) of the packed fma.
  113   %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.neg.scalar0.broadcast)
  114   store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  115   ret void
  116 }
 117
 118 ; Add scalar, but negate low component
 119 ; GCN-LABEL: {{^}}fma_vector_vector_scalar_neg_lo:
 120 ; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
 121 ; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
 122 ; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
 123
 124 ; GCN-NOT: pack
 125 ; GCN-NOT: and
 126 ; GCN-NOT: shl
 127 ; GCN-NOT: or
 128
 129 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0] neg_lo:[0,0,1]{{$}}
  130 define amdgpu_kernel void @fma_vector_vector_scalar_neg_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
  131 bb: ; Computes fma(vec0, vec1, <-scalar0, scalar0>): only lane 0 of the addend is negated.
  132   %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
  133   ; Volatile loads: vectors from %lds[0] and %lds[1], one half from %arg2.
  134   %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  135   %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
  136   %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
  137   ; Build the addend: lane 0 = -scalar0 (fsub from -0.0), lane 1 = scalar0.
  138   %neg.scalar0 = fsub half -0.0, %scalar0
  139   %neg.scalar0.vec = insertelement <2 x half> undef, half %neg.scalar0, i32 0
  140   %neg.scalar0.scalar0 = insertelement <2 x half> %neg.scalar0.vec, half %scalar0, i32 1
  141   %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.scalar0.scalar0)
  142   store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  143   ret void
  144 }
 145
 146 ; Add scalar, but negate high component
 147 ; GCN-LABEL: {{^}}fma_vector_vector_scalar_neg_hi:
 148 ; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
 149 ; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
 150 ; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
 151
 152 ; GCN-NOT: pack
 153 ; GCN-NOT: and
 154 ; GCN-NOT: shl
 155 ; GCN-NOT: or
 156
 157 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0] neg_hi:[0,0,1]{{$}}
 158 define amdgpu_kernel void @fma_vector_vector_scalar_neg_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
 159 bb:
 160   %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
 161
 162   %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
 163   %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
 164   %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
 165
 166   %neg.scalar0 = fsub half -0.0, %scalar0
 167   %neg.scalar0.vec = insertelement <2 x half> undef, half %scalar0, i32 0
 168   %scalar0.neg.scalar0 = insertelement <2 x half> %neg.scalar0.vec, half %neg.scalar0, i32 1
 169   %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %scalar0.neg.scalar0)
 170   store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
 171   ret void
 172 }
 173
 174 ; Apply fneg before broadcast with bitcast
 175 ; GCN-LABEL: {{^}}add_vector_neg_bitcast_scalar_lo:
 176 ; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
 177 ; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
 178
 179 ; GCN-NOT: pack
 180 ; GCN-NOT: and
 181 ; GCN-NOT: shl
 182 ; GCN-NOT: or
 183
 184 ; GCN: v_pk_add_u16 v{{[0-9]+}}, [[VEC0]], [[SCALAR0]] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]{{$}}
  185 define amdgpu_kernel void @add_vector_neg_bitcast_scalar_lo(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
  186 bb: ; Integer add of vec0 and a splat of bitcast(-scalar0); the negation happens on the half value before the bitcast to i16.
  187   %vec0 = load volatile <2 x i16>, <2 x i16> addrspace(3)* %lds, align 4
  188   %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
  189   %neg.scalar0 = fsub half -0.0, %scalar0
  190   %neg.scalar0.bc = bitcast half %neg.scalar0 to i16
  191   ; Splat the bitcast value: insert it into lane 0, then shuffle with an all-zero mask.
  192   %neg.scalar0.vec = insertelement <2 x i16> undef, i16 %neg.scalar0.bc, i32 0
  193   %neg.scalar0.broadcast = shufflevector <2 x i16> %neg.scalar0.vec, <2 x i16> undef, <2 x i32> zeroinitializer
  194   ; Packed 16-bit integer add; the result is stored to %out.
  195   %result = add <2 x i16> %vec0, %neg.scalar0.broadcast
  196   store <2 x i16> %result, <2 x i16> addrspace(1)* %out, align 4
  197   ret void
  198 }
 199
 200 ; GCN-LABEL: {{^}}fma_vector_vector_scalar_lo_neg_scalar_hi:
 201 ; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
 202 ; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
 203 ; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
 204 ; GCN: ds_read_u16 [[SCALAR1:v[0-9]+]]
 205
  206 ; FIXME: Remove the v_and_b32 instruction.
 207 ; GCN-DAG: v_and_b32_e32 [[SCALAR0]], 0xffff, [[SCALAR0]]
 208 ; GCN-DAG: v_xor_b32_e32 [[SCALAR1]], 0x8000, [[SCALAR1]]
 209 ; GCN: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[SCALAR1]], 16, [[SCALAR0]]
 210
 211 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[PACKED]]{{$}}
  212 define amdgpu_kernel void @fma_vector_vector_scalar_lo_neg_scalar_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
  213 bb: ; Computes fma(vec0, vec1, <scalar0, -scalar1>) from two separately loaded scalars.
  214   %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
  215   %arg2.gep = getelementptr inbounds half, half addrspace(3)* %arg2, i32 2 ; element index 2 of %arg2
  216
  217   %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  218   %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
  219   ; Two scalar halves: %arg2[0] and %arg2[2].
  220   %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
  221   %scalar1 = load volatile half, half addrspace(3)* %arg2.gep, align 2
  222   ; Build the addend: lane 0 = scalar0, lane 1 = -scalar1 (fsub from -0.0).
  223   %neg.scalar1 = fsub half -0.0, %scalar1
  224   %vec.ins0 = insertelement <2 x half> undef, half %scalar0, i32 0
  225   %vec2 = insertelement <2 x half> %vec.ins0, half %neg.scalar1, i32 1
  226   %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %vec2)
  227   store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  228   ret void
  229 }
 230
 231 ; FIXME: Can we avoid waitcnt between the two halves?
 232 ; GCN-LABEL: {{^}}fma_vector_vector_neg_scalar_lo_scalar_hi:
 233 ; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
 234 ; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
 235 ; GCN: ds_read_u16 [[PACKED:v[0-9]+]]
 236 ; GCN: s_waitcnt
 237 ; GCN: ds_read_u16_d16_hi [[PACKED]]
 238
 239 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[PACKED]] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
  240 define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo_scalar_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
  241 bb: ; Computes fma(vec0, vec1, -<scalar0, scalar1>): the vector built from two scalars is negated as a whole.
  242   %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
  243   %arg2.gep = getelementptr inbounds half, half addrspace(3)* %arg2, i32 2 ; element index 2 of %arg2
  244
  245   %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  246   %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
  247   ; Two scalar halves: %arg2[0] and %arg2[2].
  248   %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
  249   %scalar1 = load volatile half, half addrspace(3)* %arg2.gep, align 2
  250   ; Build <scalar0, scalar1>, then negate both lanes (fsub from -0.0).
  251   %vec.ins0 = insertelement <2 x half> undef, half %scalar0, i32 0
  252   %vec2 = insertelement <2 x half> %vec.ins0, half %scalar1, i32 1
  253   %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
  254
  255   %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.vec2)
  256   store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  257   ret void
  258 }
 259
 260 ; GCN-LABEL: {{^}}fma_vector_vector_neg_vector_hi:
 261 ; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
 262 ; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
 263 ; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
 264
 265 ; GCN-NOT: pack
 266 ; GCN-NOT: and
 267 ; GCN-NOT: shl
 268 ; GCN-NOT: or
 269
 270 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
  271 define amdgpu_kernel void @fma_vector_vector_neg_vector_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
  272 bb: ; Computes fma(vec0, vec1, splat((-vec2)[1])): vec2 is negated, then its lane 1 is broadcast.
  273   %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
  274   %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
  275   ; Volatile loads of three vectors from %lds[0], %lds[1] and %lds[2].
  276   %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  277   %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
  278   %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
  279   ; Negate both lanes (fsub from -0.0); mask <1, 1> then copies lane 1 into both lanes.
  280   %vec2.fneg = fsub <2 x half> <half -0.0, half -0.0>, %vec2
  281   %vec2.fneg.elt1.broadcast = shufflevector <2 x half> %vec2.fneg, <2 x half> undef, <2 x i32> <i32 1, i32 1>
  282
  283   %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %vec2.fneg.elt1.broadcast)
  284   store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  285   ret void
  286 }
 287
 288 ; GCN-LABEL: {{^}}fma_vector_vector_vector_neg_hi:
 289 ; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
 290 ; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
 291 ; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
 292
 293 ; GCN-NOT: pack
 294 ; GCN-NOT: and
 295 ; GCN-NOT: shl
 296 ; GCN-NOT: or
 297
 298 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] neg_hi:[0,0,1]{{$}}
  299 define amdgpu_kernel void @fma_vector_vector_vector_neg_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
  300 bb: ; Computes fma(vec0, vec1, <vec2[0], -vec2[1]>): only lane 1 of vec2 is negated.
  301   %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
  302   %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
  303   ; Volatile loads of three vectors from %lds[0], %lds[1] and %lds[2].
  304   %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  305   %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
  306   %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
  307   ; Extract lane 1, negate it (fsub from -0.0), and write it back into lane 1 of %vec2.
  308   %vec2.elt1 = extractelement <2 x half> %vec2, i32 1
  309   %neg.vec2.elt1 = fsub half -0.0, %vec2.elt1
  310
  311   %neg.vec2.elt1.insert = insertelement <2 x half> %vec2, half %neg.vec2.elt1, i32 1
  312   %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.vec2.elt1.insert)
  313   store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  314   ret void
  315 }
 316
 317 ; GCN-LABEL: {{^}}add_vector_scalar_hi:
 318 ; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
 319 ; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
 320
 321 ; GCN-NOT: pack
 322 ; GCN-NOT: and
 323 ; GCN-NOT: shl
 324 ; GCN-NOT: or
 325
 326 ; GCN: v_pk_add_u16 v{{[0-9]+}}, [[VEC0]], [[VEC1]] op_sel:[0,1]{{$}}
  327 define amdgpu_kernel void @add_vector_scalar_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(3)* %lds) #0 {
  328 bb: ; Integer add of vec0 and splat(vec1[1]); the result is stored to %out.
  329   %lds.gep1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(3)* %lds, i32 1
  330   ; Volatile loads of two vectors from %lds[0] and %lds[1].
  331   %vec0 = load volatile <2 x i16>, <2 x i16> addrspace(3)* %lds, align 4
  332   %vec1 = load volatile <2 x i16>, <2 x i16> addrspace(3)* %lds.gep1, align 4
  333   ; Mask <1, 1> copies lane 1 of %vec1 into both lanes.
  334   %vec1.elt1.broadcast = shufflevector <2 x i16> %vec1, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
  335   %result = add <2 x i16> %vec0, %vec1.elt1.broadcast
  336
  337   store <2 x i16> %result, <2 x i16> addrspace(1)* %out, align 4
  338   ret void
  339 }
 340
 341 ; GCN-LABEL: {{^}}fma_vector_vector_scalar_hi:
 342 ; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
 343 ; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
 344 ; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
 345
 346 ; GCN-NOT: pack
 347 ; GCN-NOT: and
 348 ; GCN-NOT: shl
 349 ; GCN-NOT: or
 350
 351 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1]{{$}}
  352 define amdgpu_kernel void @fma_vector_vector_scalar_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
  353 bb: ; Computes fma(vec0, vec1, splat(vec2[1])) and stores the result to %out.
  354   %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
  355   %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
  356   ; Volatile loads of three vectors from %lds[0], %lds[1] and %lds[2].
  357   %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  358   %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
  359   %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
  360   ; Mask <1, 1> copies lane 1 of %vec2 into both lanes.
  361   %vec2.elt1.broadcast = shufflevector <2 x half> %vec2, <2 x half> undef, <2 x i32> <i32 1, i32 1>
  362
  363   %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %vec2.elt1.broadcast)
  364
  365   store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  366   ret void
  367 }
 368
 369 ; GCN-LABEL: {{^}}fma_vector_vector_neg_vector_lo_neg_hi:
 370 ; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
 371 ; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
 372 ; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
 373
 374 ; GCN-NOT: pack
 375 ; GCN-NOT: and
 376 ; GCN-NOT: shl
 377 ; GCN-NOT: or
 378
 379 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]]{{$}}
  380 define amdgpu_kernel void @fma_vector_vector_neg_vector_lo_neg_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
  381 bb: ; The fma addend is %vec2 with lane 1 replaced by -(-vec2)[1], i.e. a doubly negated copy of its own lane 1.
  382   %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
  383   %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
  384   ; Volatile loads of three vectors from %lds[0], %lds[1] and %lds[2].
  385   %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  386   %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
  387   %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
  388   ; Negate %vec2, extract lane 1 of the negated vector, and negate that element again.
  389   %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
  390   %neg.vec2.elt1 = extractelement <2 x half> %neg.vec2, i32 1
  391   %neg.neg.vec2.elt1 = fsub half -0.0, %neg.vec2.elt1
  392   %neg.neg.vec2.elt1.insert = insertelement <2 x half> %vec2, half %neg.neg.vec2.elt1, i32 1 ; inserted into the original %vec2, not %neg.vec2
  393
  394   %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.neg.vec2.elt1.insert)
  395   store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  396   ret void
  397 }
 398
 399 ; GCN-LABEL: {{^}}fma_vector_vector_swap_vector:
 400 ; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
 401 ; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
 402 ; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
 403
 404 ; GCN-NOT: pack
 405 ; GCN-NOT: and
 406 ; GCN-NOT: shl
 407 ; GCN-NOT: or
 408
 409 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] op_sel_hi:[1,1,0]{{$}}
  410 define amdgpu_kernel void @fma_vector_vector_swap_vector(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
  411 bb: ; Computes fma(vec0, vec1, <vec2[1], vec2[0]>): the addend is vec2 with its lanes swapped.
  412   %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
  413   %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
  414   ; Volatile loads of three vectors from %lds[0], %lds[1] and %lds[2].
  415   %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  416   %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
  417   %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
  418   ; Mask <1, 0> exchanges the two lanes of %vec2.
  419   %vec2.swap = shufflevector <2 x half> %vec2, <2 x half> undef, <2 x i32> <i32 1, i32 0>
  420   %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %vec2.swap)
  421
  422   store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  423   ret void
  424 }
 425
 426 ; GCN-LABEL: {{^}}fma_vector_vector_swap_neg_vector:
 427 ; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
 428 ; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
 429 ; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
 430
 431 ; GCN-NOT: pack
 432 ; GCN-NOT: and
 433 ; GCN-NOT: shl
 434 ; GCN-NOT: or
 435 ; GCN-NOT: xor
 436
 437 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
  438 define amdgpu_kernel void @fma_vector_vector_swap_neg_vector(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
  439 bb: ; Computes fma(vec0, vec1, <-vec2[1], -vec2[0]>): vec2 is negated, then its lanes are swapped.
  440   %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
  441   %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
  442   ; Volatile loads of three vectors from %lds[0], %lds[1] and %lds[2].
  443   %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  444   %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
  445   %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
  446   %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
  447   ; Mask <1, 0> exchanges the two lanes of the negated vector.
  448   %neg.vec2.swap = shufflevector <2 x half> %neg.vec2, <2 x half> undef, <2 x i32> <i32 1, i32 0>
  449   %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.vec2.swap)
  450
  451   store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  452   ret void
  453 }
 454
 455 ; GCN-LABEL: {{^}}fma_vector_vector_blend_vector_neg_vector_0:
 456 ; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
 457 ; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
 458 ; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
 459
 460 ; GCN-NOT: pack
 461 ; GCN-NOT: and
 462 ; GCN-NOT: shl
 463 ; GCN-NOT: or
 464 ; GCN-NOT: xor
 465
 466 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1]{{$}}
  467 define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
  468 bb: ; Computes fma(vec0, vec1, <-vec2[1], vec2[0]>): a blend of vec2 and its negation.
  469   %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
  470   %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
  471   ; Volatile loads of three vectors from %lds[0], %lds[1] and %lds[2].
  472   %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  473   %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
  474   %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
  475   %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
  476   %combined = shufflevector <2 x half> %vec2, <2 x half> %neg.vec2, <2 x i32> <i32 3, i32 0> ; lane 0 = %neg.vec2[1], lane 1 = %vec2[0]
  477   %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %combined)
  478
  479   store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  480   ret void
  481 }
 482
 483 ; GCN-LABEL: {{^}}fma_vector_vector_blend_vector_neg_vector_1:
 484 ; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
 485 ; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
 486 ; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
 487
 488 ; GCN-NOT: pack
 489 ; GCN-NOT: and
 490 ; GCN-NOT: shl
 491 ; GCN-NOT: or
 492 ; GCN-NOT: xor
 493
 494 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] neg_lo:[0,0,1]{{$}}
  495 define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
  496 bb: ; Computes fma(vec0, vec1, <-vec2[0], vec2[1]>): a blend of vec2 and its negation.
  497   %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
  498   %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
  499   ; Volatile loads of three vectors from %lds[0], %lds[1] and %lds[2].
  500   %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  501   %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
  502   %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
  503   %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
  504   %combined = shufflevector <2 x half> %vec2, <2 x half> %neg.vec2, <2 x i32> <i32 2, i32 1> ; lane 0 = %neg.vec2[0], lane 1 = %vec2[1]
  505   %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %combined)
  506
  507   store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  508   ret void
  509 }
 510
 511 ; GCN-LABEL: {{^}}fma_vector_vector_blend_vector_neg_vector_2:
 512 ; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
 513 ; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
 514 ; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
 515
 516 ; GCN-NOT: pack
 517 ; GCN-NOT: and
 518 ; GCN-NOT: shl
 519 ; GCN-NOT: or
 520 ; GCN-NOT: xor
 521
 522 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] neg_hi:[0,0,1]{{$}}
  523 define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_2(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
  524 bb: ; Computes fma(vec0, vec1, <vec2[0], -vec2[1]>): a blend of vec2 and its negation.
  525   %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
  526   %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
  527   ; Volatile loads of three vectors from %lds[0], %lds[1] and %lds[2].
  528   %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  529   %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
  530   %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
  531   %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
  532   %combined = shufflevector <2 x half> %vec2, <2 x half> %neg.vec2, <2 x i32> <i32 0, i32 3> ; lane 0 = %vec2[0], lane 1 = %neg.vec2[1]
  533   %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %combined)
  534
  535   store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  536   ret void
  537 }
 538
 539 ; GCN-LABEL: {{^}}fma_vector_vector_blend_vector_neg_vector_3:
 540 ; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
 541 ; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
 542 ; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
 543
 544 ; GCN-NOT: pack
 545 ; GCN-NOT: and
 546 ; GCN-NOT: shl
 547 ; GCN-NOT: or
 548 ; GCN-NOT: xor
 549
 550 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] neg_lo:[0,0,1]{{$}}
  551 define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_3(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
  552 bb: ; Computes fma(vec0, vec1, <-vec2[1], vec2[1]>): both lanes come from lane 1 of vec2, lane 0 negated.
  553   %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
  554   %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
  555   ; Volatile loads of three vectors from %lds[0], %lds[1] and %lds[2].
  556   %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  557   %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
  558   %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
  559   %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
  560   %combined = shufflevector <2 x half> %vec2, <2 x half> %neg.vec2, <2 x i32> <i32 3, i32 1> ; lane 0 = %neg.vec2[1], lane 1 = %vec2[1]
  561   %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %combined)
  562
  563   store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  564   ret void
  565 }
 566
 567 ; GCN-LABEL: {{^}}bitcast_fneg_f32:
 568 ; GCN: v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+$}}
  569 define amdgpu_kernel void @bitcast_fneg_f32(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
  570 bb: ; Packed fadd of vec0 and bitcast(-f32): the negation is applied to a float, which is then reinterpreted as <2 x half>.
  571   %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  572   %f32 = load volatile float, float addrspace(3)* undef, align 4 ; the address is undef; only the loaded value matters here
  573   %neg.f32 = fsub float -0.0, %f32
  574   %bc = bitcast float %neg.f32 to <2 x half>
  575   %result = fadd <2 x half> %vec0, %bc
  576
  577   store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  578   ret void
  579 }
 580
 581 ; GCN-LABEL: {{^}}shuffle_bitcast_fneg_f32:
 582 ; GCN: v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} op_sel:[0,1] op_sel_hi:[1,0]{{$}}
  583 define amdgpu_kernel void @shuffle_bitcast_fneg_f32(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
  584 bb: ; Packed fadd of vec0 and a lane-swapped bitcast(-f32): the negation is applied to the float before the bitcast.
  585   %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  586
  587   %f32 = load volatile float, float addrspace(3)* undef, align 4 ; the address is undef; only the loaded value matters here
  588   %neg.f32 = fsub float -0.0, %f32
  589   %bc = bitcast float %neg.f32 to <2 x half>
  590   %shuf = shufflevector <2 x half> %bc, <2 x half> undef, <2 x i32> <i32 1, i32 0> ; swap the two half lanes
  591   %result = fadd <2 x half> %vec0, %shuf
  592   store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  593   ret void
  594 }
 595
 596 ; GCN-LABEL: {{^}}extract_from_i64:
 597 ; GCN: v_lshl_or_b32
 598 ; GCN: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+$}}
  599 define amdgpu_kernel void @extract_from_i64(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(3)* %lds) #0 {
  600 bb: ; Integer add of vec0 and a <2 x i16> assembled from the two lowest 16-bit fields of an i64, in swapped order.
  601   %vec0 = load volatile <2 x i16>, <2 x i16> addrspace(3)* %lds, align 4
  602   %i64 = load volatile i64, i64 addrspace(1)* undef
  603   ; %elt0 = bits 0..15 of %i64, %elt1 = bits 16..31 (shift right by 16, then truncate).
  604   %elt0 = trunc i64 %i64 to i16
  605   %hi = lshr i64 %i64, 16
  606   %elt1 = trunc i64 %hi to i16
  607   ; Lane 0 receives %elt1 and lane 1 receives %elt0.
  608   %ins0 = insertelement <2 x i16> undef, i16 %elt1, i32 0
  609   %ins1 = insertelement <2 x i16> %ins0, i16 %elt0, i32 1
  610   %result = add <2 x i16> %vec0, %ins1
  611   store <2 x i16> %result, <2 x i16> addrspace(1)* %out, align 4
  612   ret void
  613 }
 614
 615
  616 ; The bitcast is the final obstacle to identifying the same source register.
 617 ; GCN-LABEL: {{^}}bitcast_lo_elt_op_sel:
 618 ; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
 619 ; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
 620 ; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
 621
 622 ; GCN-NOT: pack
 623 ; GCN-NOT: and
 624 ; GCN-NOT: shl
 625 ; GCN-NOT: _or
 626
 627 ; GCN: v_pk_add_f16 [[FADD:v[0-9]+]]
 628 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[FADD]] op_sel:[0,0,1] op_sel_hi:[1,1,0]{{$}}
  629 define amdgpu_kernel void @bitcast_lo_elt_op_sel(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
  630 bb: ; Computes fma(vec0, vec1, swap(vec2 + 2.0)): the addend is a packed fadd result with its lanes swapped.
  631   %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
  632   %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
  633   ; Volatile loads of three vectors from %lds[0], %lds[1] and %lds[2].
  634   %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  635   %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
  636   %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
  637   ; NOTE(review): %shl and %shl.bc are never used below; only the volatile load of %scalar0 remains observable.
  638   %scalar0 = load volatile i16, i16 addrspace(1)* undef
  639   %shl = shl i16 %scalar0, 1
  640   %shl.bc = bitcast i16 %shl to half
  641   ; Mask <1, 0> selects only from the first operand (%fadd), so the second operand %vec2 is not read by the shuffle.
  642   %fadd = fadd <2 x half> %vec2, <half 2.0, half 2.0>
  643   %shuffle = shufflevector <2 x half> %fadd, <2 x half> %vec2, <2 x i32> <i32 1, i32 0>
  644
  645   %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %shuffle)
  646   store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  647   ret void
  648 }
 649
 650
  651 ; The bitcast is the final obstacle to identifying the same source register.
 652 ; GCN-LABEL: {{^}}mix_elt_types_op_sel:
 653 ; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
 654 ; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
 655 ; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
 656
 657 ; GCN-NOT: pack
 658 ; GCN-NOT: and
 659 ; GCN-NOT: shl
 660 ; GCN-NOT: _or
 661
 662 ; GCN: v_pk_add_f16 [[FADD:v[0-9]+]]
 663 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[FADD]] op_sel:[0,0,1] op_sel_hi:[1,1,0]{{$}}
  664 define amdgpu_kernel void @mix_elt_types_op_sel(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
  665 bb: ; Computes fma(vec0, vec1, swap(vec2 + 2.0)); a vector built from a bitcast i16 is passed as the unused second shuffle operand.
  666   %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
  667   %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
  668   ; Volatile loads of three vectors from %lds[0], %lds[1] and %lds[2].
  669   %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  670   %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
  671   %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
  672   ; NOTE(review): %scalar1 is loaded (volatile) but never used below.
  673   %scalar0 = load volatile i16, i16 addrspace(1)* undef
  674   %scalar1 = load volatile half, half addrspace(1)* undef
  675   %shl = shl i16 %scalar0, 1
  676   %shl.bc = bitcast i16 %shl to half
  677   ; Lane 0 of %insert0 holds the bitcast integer value; lane 1 stays undef.
  678   %insert0 = insertelement <2 x half> undef, half %shl.bc, i32 0
  679   ; Mask <1, 0> selects only from the first operand (%fadd), so no lane of %insert0 reaches %insert1.
  680   %fadd = fadd <2 x half> %vec2, <half 2.0, half 2.0>
  681   %insert1 = shufflevector <2 x half> %fadd, <2 x half> %insert0, <2 x i32> <i32 1, i32 0>
  682
  683   %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %insert1)
  684   store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  685   ret void
  686 }
 687
  688 declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #1 ; packed fma intrinsic called by the fma_* kernels in this file
  689
  690 attributes #0 = { nounwind } ; attached to the kernel definitions
  691 attributes #1 = { nounwind readnone } ; attached to the intrinsic declaration