llvm/test/CodeGen/AArch64/sve-intrinsics-bfloat.ll

   1 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+bf16 -asm-verbose=0 < %s | FileCheck %s
   2
   3 ;
   4 ; BFDOT
   5 ;
   6
   ; Test: calling the unindexed @llvm.aarch64.sve.bfdot intrinsic with %a, %b and %c
   ; is expected to emit `bfdot z0.s, z1.h, z2.h` immediately followed by `ret`.
   7 define <vscale x 4 x float> @bfdot_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
   8 ; CHECK-LABEL: bfdot_f32:
   9 ; CHECK-NEXT:  bfdot z0.s, z1.h, z2.h
  10 ; CHECK-NEXT:  ret
  11   %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c)
  12   ret <vscale x 4 x float> %out
  13 }
  14
  ; Test: @llvm.aarch64.sve.bfdot.lane called with the constant lane operand i64 0
  ; is expected to emit the indexed form `bfdot z0.s, z1.h, z2.h[0]`, then `ret`.
  15 define <vscale x 4 x float> @bfdot_lane_0_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
  16 ; CHECK-LABEL: bfdot_lane_0_f32:
  17 ; CHECK-NEXT:  bfdot z0.s, z1.h, z2.h[0]
  18 ; CHECK-NEXT:  ret
  19   %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 0)
  20   ret <vscale x 4 x float> %out
  21 }
  22
  ; Test: @llvm.aarch64.sve.bfdot.lane called with the constant lane operand i64 1
  ; is expected to emit the indexed form `bfdot z0.s, z1.h, z2.h[1]`, then `ret`.
  23 define <vscale x 4 x float> @bfdot_lane_1_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
  24 ; CHECK-LABEL: bfdot_lane_1_f32:
  25 ; CHECK-NEXT:  bfdot z0.s, z1.h, z2.h[1]
  26 ; CHECK-NEXT:  ret
  27   %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 1)
  28   ret <vscale x 4 x float> %out
  29 }
  30
  ; Test: @llvm.aarch64.sve.bfdot.lane called with the constant lane operand i64 2
  ; is expected to emit the indexed form `bfdot z0.s, z1.h, z2.h[2]`, then `ret`.
  31 define <vscale x 4 x float> @bfdot_lane_2_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
  32 ; CHECK-LABEL: bfdot_lane_2_f32:
  33 ; CHECK-NEXT:  bfdot z0.s, z1.h, z2.h[2]
  34 ; CHECK-NEXT:  ret
  35   %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 2)
  36   ret <vscale x 4 x float> %out
  37 }
  38
  ; Test: @llvm.aarch64.sve.bfdot.lane called with the constant lane operand i64 3
  ; is expected to emit the indexed form `bfdot z0.s, z1.h, z2.h[3]`, then `ret`.
  ; 3 is the largest lane operand used by the bfdot.lane tests visible in this file.
  39 define <vscale x 4 x float> @bfdot_lane_3_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
  40 ; CHECK-LABEL: bfdot_lane_3_f32:
  41 ; CHECK-NEXT:  bfdot z0.s, z1.h, z2.h[3]
  42 ; CHECK-NEXT:  ret
  43   %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 3)
  44   ret <vscale x 4 x float> %out
  45 }
  46
  47 ;
  48 ; BFMLALB
  49 ;
  50
  ; Test: calling the unindexed @llvm.aarch64.sve.bfmlalb intrinsic with %a, %b and %c
  ; is expected to emit `bfmlalb z0.s, z1.h, z2.h` immediately followed by `ret`.
  51 define <vscale x 4 x float> @bfmlalb_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
  52 ; CHECK-LABEL: bfmlalb_f32:
  53 ; CHECK-NEXT:  bfmlalb z0.s, z1.h, z2.h
  54 ; CHECK-NEXT:  ret
  55   %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c)
  56   ret <vscale x 4 x float> %out
  57 }
  58
  ; Test: @llvm.aarch64.sve.bfmlalb.lane called with the constant lane operand i64 0
  ; is expected to emit the indexed form `bfmlalb z0.s, z1.h, z2.h[0]`, then `ret`.
  59 define <vscale x 4 x float> @bfmlalb_lane_0_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
  60 ; CHECK-LABEL: bfmlalb_lane_0_f32:
  61 ; CHECK-NEXT:  bfmlalb z0.s, z1.h, z2.h[0]
  62 ; CHECK-NEXT:  ret
  63   %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 0)
  64   ret <vscale x 4 x float> %out
  65 }
  66
  ; Test: @llvm.aarch64.sve.bfmlalb.lane called with the constant lane operand i64 1
  ; is expected to emit the indexed form `bfmlalb z0.s, z1.h, z2.h[1]`, then `ret`.
  67 define <vscale x 4 x float> @bfmlalb_lane_1_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
  68 ; CHECK-LABEL: bfmlalb_lane_1_f32:
  69 ; CHECK-NEXT:  bfmlalb z0.s, z1.h, z2.h[1]
  70 ; CHECK-NEXT:  ret
  71   %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 1)
  72   ret <vscale x 4 x float> %out
  73 }
  74
  ; Test: @llvm.aarch64.sve.bfmlalb.lane called with the constant lane operand i64 2
  ; is expected to emit the indexed form `bfmlalb z0.s, z1.h, z2.h[2]`, then `ret`.
  75 define <vscale x 4 x float> @bfmlalb_lane_2_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
  76 ; CHECK-LABEL: bfmlalb_lane_2_f32:
  77 ; CHECK-NEXT:  bfmlalb z0.s, z1.h, z2.h[2]
  78 ; CHECK-NEXT:  ret
  79   %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 2)
  80   ret <vscale x 4 x float> %out
  81 }
  82
  ; Test: @llvm.aarch64.sve.bfmlalb.lane called with the constant lane operand i64 3
  ; is expected to emit the indexed form `bfmlalb z0.s, z1.h, z2.h[3]`, then `ret`.
  83 define <vscale x 4 x float> @bfmlalb_lane_3_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
  84 ; CHECK-LABEL: bfmlalb_lane_3_f32:
  85 ; CHECK-NEXT:  bfmlalb z0.s, z1.h, z2.h[3]
  86 ; CHECK-NEXT:  ret
  87   %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 3)
  88   ret <vscale x 4 x float> %out
  89 }
  90
  ; Test: @llvm.aarch64.sve.bfmlalb.lane called with the constant lane operand i64 4
  ; is expected to emit the indexed form `bfmlalb z0.s, z1.h, z2.h[4]`, then `ret`.
  91 define <vscale x 4 x float> @bfmlalb_lane_4_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
  92 ; CHECK-LABEL: bfmlalb_lane_4_f32:
  93 ; CHECK-NEXT:  bfmlalb z0.s, z1.h, z2.h[4]
  94 ; CHECK-NEXT:  ret
  95   %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 4)
  96   ret <vscale x 4 x float> %out
  97 }
  98
  ; Test: @llvm.aarch64.sve.bfmlalb.lane called with the constant lane operand i64 5
  ; is expected to emit the indexed form `bfmlalb z0.s, z1.h, z2.h[5]`, then `ret`.
  99 define <vscale x 4 x float> @bfmlalb_lane_5_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
 100 ; CHECK-LABEL: bfmlalb_lane_5_f32:
 101 ; CHECK-NEXT:  bfmlalb z0.s, z1.h, z2.h[5]
 102 ; CHECK-NEXT:  ret
 103   %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 5)
 104   ret <vscale x 4 x float> %out
 105 }
 106
 ; Test: @llvm.aarch64.sve.bfmlalb.lane called with the constant lane operand i64 6
 ; is expected to emit the indexed form `bfmlalb z0.s, z1.h, z2.h[6]`, then `ret`.
 107 define <vscale x 4 x float> @bfmlalb_lane_6_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
 108 ; CHECK-LABEL: bfmlalb_lane_6_f32:
 109 ; CHECK-NEXT:  bfmlalb z0.s, z1.h, z2.h[6]
 110 ; CHECK-NEXT:  ret
 111   %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 6)
 112   ret <vscale x 4 x float> %out
 113 }
 114
 ; Test: @llvm.aarch64.sve.bfmlalb.lane called with the constant lane operand i64 7
 ; is expected to emit the indexed form `bfmlalb z0.s, z1.h, z2.h[7]`, then `ret`.
 ; 7 is the largest lane operand used by the bfmlalb.lane tests visible in this file.
 115 define <vscale x 4 x float> @bfmlalb_lane_7_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
 116 ; CHECK-LABEL: bfmlalb_lane_7_f32:
 117 ; CHECK-NEXT:  bfmlalb z0.s, z1.h, z2.h[7]
 118 ; CHECK-NEXT:  ret
 119   %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 7)
 120   ret <vscale x 4 x float> %out
 121 }
 122
 123 ;
 124 ; BFMLALT
 125 ;
 126
 ; Test: calling the unindexed @llvm.aarch64.sve.bfmlalt intrinsic with %a, %b and %c
 ; is expected to emit `bfmlalt z0.s, z1.h, z2.h` immediately followed by `ret`.
 127 define <vscale x 4 x float> @bfmlalt_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
 128 ; CHECK-LABEL: bfmlalt_f32:
 129 ; CHECK-NEXT:  bfmlalt z0.s, z1.h, z2.h
 130 ; CHECK-NEXT:  ret
 131   %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c)
 132   ret <vscale x 4 x float> %out
 133 }
 134
 ; Test: @llvm.aarch64.sve.bfmlalt.lane called with the constant lane operand i64 0
 ; is expected to emit the indexed form `bfmlalt z0.s, z1.h, z2.h[0]`, then `ret`.
 135 define <vscale x 4 x float> @bfmlalt_lane_0_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
 136 ; CHECK-LABEL: bfmlalt_lane_0_f32:
 137 ; CHECK-NEXT:  bfmlalt z0.s, z1.h, z2.h[0]
 138 ; CHECK-NEXT:  ret
 139   %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 0)
 140   ret <vscale x 4 x float> %out
 141 }
 142
 ; Test: @llvm.aarch64.sve.bfmlalt.lane called with the constant lane operand i64 1
 ; is expected to emit the indexed form `bfmlalt z0.s, z1.h, z2.h[1]`, then `ret`.
 143 define <vscale x 4 x float> @bfmlalt_lane_1_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
 144 ; CHECK-LABEL: bfmlalt_lane_1_f32:
 145 ; CHECK-NEXT:  bfmlalt z0.s, z1.h, z2.h[1]
 146 ; CHECK-NEXT:  ret
 147   %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 1)
 148   ret <vscale x 4 x float> %out
 149 }
 150
 ; Test: @llvm.aarch64.sve.bfmlalt.lane called with the constant lane operand i64 2
 ; is expected to emit the indexed form `bfmlalt z0.s, z1.h, z2.h[2]`, then `ret`.
 151 define <vscale x 4 x float> @bfmlalt_lane_2_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
 152 ; CHECK-LABEL: bfmlalt_lane_2_f32:
 153 ; CHECK-NEXT:  bfmlalt z0.s, z1.h, z2.h[2]
 154 ; CHECK-NEXT:  ret
 155   %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 2)
 156   ret <vscale x 4 x float> %out
 157 }
 158
 ; Test: @llvm.aarch64.sve.bfmlalt.lane called with the constant lane operand i64 3
 ; is expected to emit the indexed form `bfmlalt z0.s, z1.h, z2.h[3]`, then `ret`.
 159 define <vscale x 4 x float> @bfmlalt_lane_3_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
 160 ; CHECK-LABEL: bfmlalt_lane_3_f32:
 161 ; CHECK-NEXT:  bfmlalt z0.s, z1.h, z2.h[3]
 162 ; CHECK-NEXT:  ret
 163   %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 3)
 164   ret <vscale x 4 x float> %out
 165 }
 166
 ; Test: @llvm.aarch64.sve.bfmlalt.lane called with the constant lane operand i64 4
 ; is expected to emit the indexed form `bfmlalt z0.s, z1.h, z2.h[4]`, then `ret`.
 167 define <vscale x 4 x float> @bfmlalt_lane_4_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
 168 ; CHECK-LABEL: bfmlalt_lane_4_f32:
 169 ; CHECK-NEXT:  bfmlalt z0.s, z1.h, z2.h[4]
 170 ; CHECK-NEXT:  ret
 171   %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 4)
 172   ret <vscale x 4 x float> %out
 173 }
 174
 ; Test: @llvm.aarch64.sve.bfmlalt.lane called with the constant lane operand i64 5
 ; is expected to emit the indexed form `bfmlalt z0.s, z1.h, z2.h[5]`, then `ret`.
 175 define <vscale x 4 x float> @bfmlalt_lane_5_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
 176 ; CHECK-LABEL: bfmlalt_lane_5_f32:
 177 ; CHECK-NEXT:  bfmlalt z0.s, z1.h, z2.h[5]
 178 ; CHECK-NEXT:  ret
 179   %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 5)
 180   ret <vscale x 4 x float> %out
 181 }
 182
 ; Test: @llvm.aarch64.sve.bfmlalt.lane called with the constant lane operand i64 6
 ; is expected to emit the indexed form `bfmlalt z0.s, z1.h, z2.h[6]`, then `ret`.
 183 define <vscale x 4 x float> @bfmlalt_lane_6_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
 184 ; CHECK-LABEL: bfmlalt_lane_6_f32:
 185 ; CHECK-NEXT:  bfmlalt z0.s, z1.h, z2.h[6]
 186 ; CHECK-NEXT:  ret
 187   %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 6)
 188   ret <vscale x 4 x float> %out
 189 }
 190
 ; Test: @llvm.aarch64.sve.bfmlalt.lane called with the constant lane operand i64 7
 ; is expected to emit the indexed form `bfmlalt z0.s, z1.h, z2.h[7]`, then `ret`.
 ; 7 is the largest lane operand used by the bfmlalt.lane tests visible in this file.
 191 define <vscale x 4 x float> @bfmlalt_lane_7_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
 192 ; CHECK-LABEL: bfmlalt_lane_7_f32:
 193 ; CHECK-NEXT:  bfmlalt z0.s, z1.h, z2.h[7]
 194 ; CHECK-NEXT:  ret
 195   %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 7)
 196   ret <vscale x 4 x float> %out
 197 }
 198
 199 ;
 200 ; BFMMLA
 201 ;
 202
 ; Test: calling @llvm.aarch64.sve.bfmmla with %a, %b and %c is expected to emit
 ; `bfmmla z0.s, z1.h, z2.h` immediately followed by `ret`. Only this unindexed
 ; form is tested; the file declares no .lane variant of the bfmmla intrinsic.
 203 define <vscale x 4 x float> @bfmmla_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
 204 ; CHECK-LABEL: bfmmla_f32:
 205 ; CHECK-NEXT:  bfmmla z0.s, z1.h, z2.h
 206 ; CHECK-NEXT:  ret
 207   %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmmla(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c)
 208   ret <vscale x 4 x float> %out
 209 }
 210
 211 ;
 212 ; BFCVT
 213 ;
 214
 ; Test: calling @llvm.aarch64.sve.fcvt.bf16f32 with the bfloat vector %a, the
 ; predicate %pg and the float vector %b is expected to emit the predicated
 ; instruction `bfcvt z0.h, p0/m, z1.s` immediately followed by `ret`.
 ; Note the function and intrinsic are named "fcvt" while the expected mnemonic
 ; is "bfcvt".
 215 define <vscale x 8 x bfloat> @fcvt_bf16_f32(<vscale x 8 x bfloat> %a, <vscale x 8 x i1> %pg, <vscale x 4 x float> %b) nounwind {
 216 ; CHECK-LABEL: fcvt_bf16_f32:
 217 ; CHECK-NEXT: bfcvt z0.h, p0/m, z1.s
 218 ; CHECK-NEXT: ret
 219   %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvt.bf16f32(<vscale x 8 x bfloat> %a, <vscale x 8 x i1> %pg, <vscale x 4 x float> %b)
 220   ret <vscale x 8 x bfloat> %out
 221 }
 222
 223 ;
 224 ; BFCVTNT
 225 ;
 226
 ; Test: calling @llvm.aarch64.sve.fcvtnt.bf16f32 with the bfloat vector %a, the
 ; predicate %pg and the float vector %b is expected to emit the predicated
 ; instruction `bfcvtnt z0.h, p0/m, z1.s` immediately followed by `ret`.
 ; Note the function and intrinsic are named "fcvtnt" while the expected mnemonic
 ; is "bfcvtnt".
 227 define <vscale x 8 x bfloat> @fcvtnt_bf16_f32(<vscale x 8 x bfloat> %a, <vscale x 8 x i1> %pg, <vscale x 4 x float> %b) nounwind {
 228 ; CHECK-LABEL: fcvtnt_bf16_f32:
 229 ; CHECK-NEXT: bfcvtnt z0.h, p0/m, z1.s
 230 ; CHECK-NEXT: ret
 231   %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvtnt.bf16f32(<vscale x 8 x bfloat> %a, <vscale x 8 x i1> %pg, <vscale x 4 x float> %b)
 232   ret <vscale x 8 x bfloat> %out
 233 }
 234
 ; Declarations of the SVE bfloat intrinsics called by the test functions in this
 ; file. The first seven take a float accumulator-typed vector plus two bfloat
 ; vectors and return <vscale x 4 x float>; their .lane variants add one trailing
 ; i64 operand. The last two take a bfloat vector, a <vscale x 8 x i1> predicate
 ; and a float vector, and return <vscale x 8 x bfloat>.
 235 declare <vscale x 4 x float> @llvm.aarch64.sve.bfdot(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
 236 declare <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i64)
 237 declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
 238 declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i64)
 239 declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
 240 declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i64)
 241 declare <vscale x 4 x float> @llvm.aarch64.sve.bfmmla(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
 242 declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvt.bf16f32(<vscale x 8 x bfloat>, <vscale x 8 x i1>, <vscale x 4 x float>)
 243 declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvtnt.bf16f32(<vscale x 8 x bfloat>, <vscale x 8 x i1>, <vscale x 4 x float>)