llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulq.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s
   3
   4 define arm_aapcs_vfpcc <16 x i8> @test_vmulq_u8(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 {
     ; Unpredicated multiply of two <16 x i8> vectors, written as a plain IR `mul`.
     ; The assertions below expect a single vmul.i8 on q registers followed by `bx lr`.
     ; The IR operand order is (%b, %a); the expected instruction lists q1 before q0.
   5 ; CHECK-LABEL: test_vmulq_u8:
   6 ; CHECK:       @ %bb.0: @ %entry
   7 ; CHECK-NEXT:    vmul.i8 q0, q1, q0
   8 ; CHECK-NEXT:    bx lr
   9 entry:
  10   %0 = mul <16 x i8> %b, %a
  11   ret <16 x i8> %0
  12 }
  13
  14 define arm_aapcs_vfpcc <8 x i16> @test_vmulq_s16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr #0 {
     ; Unpredicated multiply of two <8 x i16> vectors, written as a plain IR `mul`.
     ; The assertions below expect a single vmul.i16 on q registers followed by `bx lr`.
     ; The IR operand order is (%b, %a); the expected instruction lists q1 before q0.
  15 ; CHECK-LABEL: test_vmulq_s16:
  16 ; CHECK:       @ %bb.0: @ %entry
  17 ; CHECK-NEXT:    vmul.i16 q0, q1, q0
  18 ; CHECK-NEXT:    bx lr
  19 entry:
  20   %0 = mul <8 x i16> %b, %a
  21   ret <8 x i16> %0
  22 }
  23
  24 define arm_aapcs_vfpcc <4 x i32> @test_vmulq_u32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr #0 {
     ; Unpredicated multiply of two <4 x i32> vectors, written as a plain IR `mul`.
     ; The assertions below expect a single vmul.i32 on q registers followed by `bx lr`.
     ; The IR operand order is (%b, %a); the expected instruction lists q1 before q0.
  25 ; CHECK-LABEL: test_vmulq_u32:
  26 ; CHECK:       @ %bb.0: @ %entry
  27 ; CHECK-NEXT:    vmul.i32 q0, q1, q0
  28 ; CHECK-NEXT:    bx lr
  29 entry:
  30   %0 = mul <4 x i32> %b, %a
  31   ret <4 x i32> %0
  32 }
  33
  34 define arm_aapcs_vfpcc <4 x float> @test_vmulq_f32(<4 x float> %a, <4 x float> %b) local_unnamed_addr #0 {
     ; Unpredicated multiply of two <4 x float> vectors, written as a plain IR `fmul`.
     ; The assertions below expect a single vmul.f32 on q registers followed by `bx lr`.
     ; Here the IR operand order is (%a, %b) and the expected instruction lists q0
     ; before q1, the reverse of the integer tests earlier in this file.
  35 ; CHECK-LABEL: test_vmulq_f32:
  36 ; CHECK:       @ %bb.0: @ %entry
  37 ; CHECK-NEXT:    vmul.f32 q0, q0, q1
  38 ; CHECK-NEXT:    bx lr
  39 entry:
  40   %0 = fmul <4 x float> %a, %b
  41   ret <4 x float> %0
  42 }
  43
  44 define arm_aapcs_vfpcc <16 x i8> @test_vmulq_m_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #1 {
     ; Predicated <16 x i8> multiply through llvm.arm.mve.mul.predicated.
     ; %p is zero-extended to i32, converted to a <16 x i1> value by
     ; llvm.arm.mve.pred.i2v, and passed as the third intrinsic operand;
     ; %inactive is passed as the fourth.
     ; Expected output: vmsr p0, r0 / vpst / vmult.i8 q0, q1, q2 / bx lr.
  45 ; CHECK-LABEL: test_vmulq_m_s8:
  46 ; CHECK:       @ %bb.0: @ %entry
  47 ; CHECK-NEXT:    vmsr p0, r0
  48 ; CHECK-NEXT:    vpst
  49 ; CHECK-NEXT:    vmult.i8 q0, q1, q2
  50 ; CHECK-NEXT:    bx lr
  51 entry:
  52   %0 = zext i16 %p to i32
  53   %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
  54   %2 = tail call <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> %inactive)
  55   ret <16 x i8> %2
  56 }
  57
  58 declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #2
     ; ^ Takes an i32 and returns <16 x i1>; the tests in this file feed it the
     ;   zero-extended %p argument.
  59
  60 declare <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>) #2
     ; ^ Operands as used by the call sites in this file: two <16 x i8> multiplicands,
     ;   the <16 x i1> predicate, then either %inactive or undef.
  61
  62 define arm_aapcs_vfpcc <8 x i16> @test_vmulq_m_u16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #1 {
     ; Predicated <8 x i16> multiply through llvm.arm.mve.mul.predicated.
     ; %p is zero-extended to i32, converted to a <8 x i1> value by
     ; llvm.arm.mve.pred.i2v, and passed as the third intrinsic operand;
     ; %inactive is passed as the fourth.
     ; Expected output: vmsr p0, r0 / vpst / vmult.i16 q0, q1, q2 / bx lr.
  63 ; CHECK-LABEL: test_vmulq_m_u16:
  64 ; CHECK:       @ %bb.0: @ %entry
  65 ; CHECK-NEXT:    vmsr p0, r0
  66 ; CHECK-NEXT:    vpst
  67 ; CHECK-NEXT:    vmult.i16 q0, q1, q2
  68 ; CHECK-NEXT:    bx lr
  69 entry:
  70   %0 = zext i16 %p to i32
  71   %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
  72   %2 = tail call <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> %inactive)
  73   ret <8 x i16> %2
  74 }
  75
  76 declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #2
     ; ^ Takes an i32 and returns <8 x i1>; shared by the i16 and f16 tests in this file.
  77
  78 declare <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #2
     ; ^ Operands as used by the call sites in this file: two <8 x i16> multiplicands,
     ;   the <8 x i1> predicate, then either %inactive or undef.
  79
  80 define arm_aapcs_vfpcc <4 x i32> @test_vmulq_m_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #1 {
     ; Predicated <4 x i32> multiply through llvm.arm.mve.mul.predicated.
     ; %p is zero-extended to i32, converted to a <4 x i1> value by
     ; llvm.arm.mve.pred.i2v, and passed as the third intrinsic operand;
     ; %inactive is passed as the fourth.
     ; Expected output: vmsr p0, r0 / vpst / vmult.i32 q0, q1, q2 / bx lr.
  81 ; CHECK-LABEL: test_vmulq_m_s32:
  82 ; CHECK:       @ %bb.0: @ %entry
  83 ; CHECK-NEXT:    vmsr p0, r0
  84 ; CHECK-NEXT:    vpst
  85 ; CHECK-NEXT:    vmult.i32 q0, q1, q2
  86 ; CHECK-NEXT:    bx lr
  87 entry:
  88   %0 = zext i16 %p to i32
  89   %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  90   %2 = tail call <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> %inactive)
  91   ret <4 x i32> %2
  92 }
  93
  94 declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2
     ; ^ Takes an i32 and returns <4 x i1>; shared by the i32 and f32 tests in this file.
  95
  96 declare <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #2
     ; ^ Operands as used by the call sites in this file: two <4 x i32> multiplicands,
     ;   the <4 x i1> predicate, then either %inactive or undef.
  97
  98 define arm_aapcs_vfpcc <8 x half> @test_vmulq_m_f16(<8 x half> %inactive, <8 x half> %a, <8 x half> %b, i16 zeroext %p) local_unnamed_addr #1 {
     ; Predicated <8 x half> multiply through llvm.arm.mve.mul.predicated.
     ; %p is zero-extended to i32, converted to a <8 x i1> value by
     ; llvm.arm.mve.pred.i2v, and passed as the third intrinsic operand;
     ; %inactive is passed as the fourth.
     ; Expected output: vmsr p0, r0 / vpst / vmult.f16 q0, q1, q2 / bx lr.
  99 ; CHECK-LABEL: test_vmulq_m_f16:
 100 ; CHECK:       @ %bb.0: @ %entry
 101 ; CHECK-NEXT:    vmsr p0, r0
 102 ; CHECK-NEXT:    vpst
 103 ; CHECK-NEXT:    vmult.f16 q0, q1, q2
 104 ; CHECK-NEXT:    bx lr
 105 entry:
 106   %0 = zext i16 %p to i32
 107   %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
 108   %2 = tail call <8 x half> @llvm.arm.mve.mul.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, <8 x i1> %1, <8 x half> %inactive)
 109   ret <8 x half> %2
 110 }
 111
 112 declare <8 x half> @llvm.arm.mve.mul.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x i1>, <8 x half>) #2
     ; ^ Operands as used by the call sites in this file: two <8 x half> multiplicands,
     ;   the <8 x i1> predicate, then %inactive.
 113
 114 define arm_aapcs_vfpcc <16 x i8> @test_vmulq_x_u8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #1 {
     ; Predicated <16 x i8> multiply with no %inactive argument: the fourth
     ; intrinsic operand is undef. The predicate is built from %p via zext and
     ; llvm.arm.mve.pred.i2v.
     ; Expected output: vmsr p0, r0 / vpst / vmult.i8 q0, q0, q1 / bx lr.
 115 ; CHECK-LABEL: test_vmulq_x_u8:
 116 ; CHECK:       @ %bb.0: @ %entry
 117 ; CHECK-NEXT:    vmsr p0, r0
 118 ; CHECK-NEXT:    vpst
 119 ; CHECK-NEXT:    vmult.i8 q0, q0, q1
 120 ; CHECK-NEXT:    bx lr
 121 entry:
 122   %0 = zext i16 %p to i32
 123   %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
 124   %2 = tail call <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> undef)
 125   ret <16 x i8> %2
 126 }
 127
 128 define arm_aapcs_vfpcc <8 x i16> @test_vmulq_x_s16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #1 {
     ; Predicated <8 x i16> multiply with no %inactive argument: the fourth
     ; intrinsic operand is undef. The predicate is built from %p via zext and
     ; llvm.arm.mve.pred.i2v.
     ; Expected output: vmsr p0, r0 / vpst / vmult.i16 q0, q0, q1 / bx lr.
 129 ; CHECK-LABEL: test_vmulq_x_s16:
 130 ; CHECK:       @ %bb.0: @ %entry
 131 ; CHECK-NEXT:    vmsr p0, r0
 132 ; CHECK-NEXT:    vpst
 133 ; CHECK-NEXT:    vmult.i16 q0, q0, q1
 134 ; CHECK-NEXT:    bx lr
 135 entry:
 136   %0 = zext i16 %p to i32
 137   %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
 138   %2 = tail call <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> undef)
 139   ret <8 x i16> %2
 140 }
 141
 142 define arm_aapcs_vfpcc <4 x i32> @test_vmulq_x_u32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #1 {
     ; Predicated <4 x i32> multiply with no %inactive argument: the fourth
     ; intrinsic operand is undef. The predicate is built from %p via zext and
     ; llvm.arm.mve.pred.i2v.
     ; Expected output: vmsr p0, r0 / vpst / vmult.i32 q0, q0, q1 / bx lr.
 143 ; CHECK-LABEL: test_vmulq_x_u32:
 144 ; CHECK:       @ %bb.0: @ %entry
 145 ; CHECK-NEXT:    vmsr p0, r0
 146 ; CHECK-NEXT:    vpst
 147 ; CHECK-NEXT:    vmult.i32 q0, q0, q1
 148 ; CHECK-NEXT:    bx lr
 149 entry:
 150   %0 = zext i16 %p to i32
 151   %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
 152   %2 = tail call <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> undef)
 153   ret <4 x i32> %2
 154 }
 155
 156 define arm_aapcs_vfpcc <4 x float> @test_vmulq_m_f32(<4 x float> %a, <4 x float> %b, i16 zeroext %p) local_unnamed_addr #1 {
     ; Predicated <4 x float> multiply with no %inactive argument: the fourth
     ; intrinsic operand is undef. The predicate is built from %p via zext and
     ; llvm.arm.mve.pred.i2v.
     ; Expected output: vmsr p0, r0 / vpst / vmult.f32 q0, q0, q1 / bx lr.
     ; NOTE(review): the name says "_m_", but the signature and the undef operand
     ; match the "_x_" tests in this file (the "_m_" tests take %inactive and pass
     ; it through). Possibly meant to be test_vmulq_x_f32 -- confirm against the
     ; generating C test before renaming.
 157 ; CHECK-LABEL: test_vmulq_m_f32:
 158 ; CHECK:       @ %bb.0: @ %entry
 159 ; CHECK-NEXT:    vmsr p0, r0
 160 ; CHECK-NEXT:    vpst
 161 ; CHECK-NEXT:    vmult.f32 q0, q0, q1
 162 ; CHECK-NEXT:    bx lr
 163 entry:
 164   %0 = zext i16 %p to i32
 165   %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
 166   %2 = tail call <4 x float> @llvm.arm.mve.mul.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, <4 x i1> %1, <4 x float> undef)
 167   ret <4 x float> %2
 168 }
 169
 170 declare <4 x float> @llvm.arm.mve.mul.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) #2
     ; ^ Operands as used by the call sites in this file: two <4 x float> multiplicands,
     ;   the <4 x i1> predicate, then undef.
 171
 172 define arm_aapcs_vfpcc <16 x i8> @test_vmulq_n_u8(<16 x i8> %a, i8 zeroext %b) {
     ; Vector-by-scalar multiply: %b is splatted to <16 x i8> (insertelement into
     ; lane 0, then shufflevector with an all-zero mask) and multiplied with %a.
     ; The assertions below expect the splat to fold into a single vmul.i8 that
     ; takes r0 as its last operand, with no separate splat instruction.
 173 ; CHECK-LABEL: test_vmulq_n_u8:
 174 ; CHECK:       @ %bb.0: @ %entry
 175 ; CHECK-NEXT:    vmul.i8 q0, q0, r0
 176 ; CHECK-NEXT:    bx lr
 177 entry:
 178   %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0
 179   %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer
 180   %0 = mul <16 x i8> %.splat, %a
 181   ret <16 x i8> %0
 182 }
 183
 184 define arm_aapcs_vfpcc <8 x i16> @test_vmulq_n_s16(<8 x i16> %a, i16 signext %b) {
     ; Vector-by-scalar multiply: %b is splatted to <8 x i16> (insertelement into
     ; lane 0, then shufflevector with an all-zero mask) and multiplied with %a.
     ; The assertions below expect the splat to fold into a single vmul.i16 that
     ; takes r0 as its last operand, with no separate splat instruction.
 185 ; CHECK-LABEL: test_vmulq_n_s16:
 186 ; CHECK:       @ %bb.0: @ %entry
 187 ; CHECK-NEXT:    vmul.i16 q0, q0, r0
 188 ; CHECK-NEXT:    bx lr
 189 entry:
 190   %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0
 191   %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
 192   %0 = mul <8 x i16> %.splat, %a
 193   ret <8 x i16> %0
 194 }
 195
 196 define arm_aapcs_vfpcc <4 x i32> @test_vmulq_n_u32(<4 x i32> %a, i32 %b) {
     ; Vector-by-scalar multiply: %b is splatted to <4 x i32> (insertelement into
     ; lane 0, then shufflevector with an all-zero mask) and multiplied with %a.
     ; The assertions below expect the splat to fold into a single vmul.i32 that
     ; takes r0 as its last operand, with no separate splat instruction.
 197 ; CHECK-LABEL: test_vmulq_n_u32:
 198 ; CHECK:       @ %bb.0: @ %entry
 199 ; CHECK-NEXT:    vmul.i32 q0, q0, r0
 200 ; CHECK-NEXT:    bx lr
 201 entry:
 202   %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0
 203   %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
 204   %0 = mul <4 x i32> %.splat, %a
 205   ret <4 x i32> %0
 206 }
 207
 208 define arm_aapcs_vfpcc <4 x float> @test_vmulq_n_f32(<4 x float> %a, float %b) {
     ; Vector-by-scalar float multiply: %b is splatted to <4 x float>
     ; (insertelement into lane 0, then shufflevector with an all-zero mask) and
     ; multiplied with %a via `fmul`.
     ; The assertions below expect a `vmov r0, s4` first, then a vmul.f32 that
     ; takes r0 as its last operand.
 209 ; CHECK-LABEL: test_vmulq_n_f32:
 210 ; CHECK:       @ %bb.0: @ %entry
 211 ; CHECK-NEXT:    vmov r0, s4
 212 ; CHECK-NEXT:    vmul.f32 q0, q0, r0
 213 ; CHECK-NEXT:    bx lr
 214 entry:
 215   %.splatinsert = insertelement <4 x float> undef, float %b, i32 0
 216   %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
 217   %0 = fmul <4 x float> %.splat, %a
 218   ret <4 x float> %0
 219 }
 220
 221 define arm_aapcs_vfpcc <16 x i8> @test_vmulq_m_n_s8(<16 x i8> %inactive, <16 x i8> %a, i8 signext %b, i16 zeroext %p) {
     ; Predicated vector-by-scalar multiply: %b is splatted to <16 x i8> and used
     ; as the second operand of llvm.arm.mve.mul.predicated; the predicate is
     ; built from %p (zext + pred.i2v) and %inactive is the fourth operand.
     ; Expected output: vmsr p0, r1 / vpst / vmult.i8 q0, q1, r0 / bx lr
     ; (the scalar is in r0 here, so the predicate is read from r1).
 222 ; CHECK-LABEL: test_vmulq_m_n_s8:
 223 ; CHECK:       @ %bb.0: @ %entry
 224 ; CHECK-NEXT:    vmsr p0, r1
 225 ; CHECK-NEXT:    vpst
 226 ; CHECK-NEXT:    vmult.i8 q0, q1, r0
 227 ; CHECK-NEXT:    bx lr
 228 entry:
 229   %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0
 230   %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer
 231   %0 = zext i16 %p to i32
 232   %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
 233   %2 = call <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %.splat, <16 x i1> %1, <16 x i8> %inactive)
 234   ret <16 x i8> %2
 235 }
 236
 237 define arm_aapcs_vfpcc <8 x i16> @test_vmulq_m_n_u16(<8 x i16> %inactive, <8 x i16> %a, i16 zeroext %b, i16 zeroext %p) {
     ; Predicated vector-by-scalar multiply: %b is splatted to <8 x i16> and used
     ; as the second operand of llvm.arm.mve.mul.predicated; the predicate is
     ; built from %p (zext + pred.i2v) and %inactive is the fourth operand.
     ; Expected output: vmsr p0, r1 / vpst / vmult.i16 q0, q1, r0 / bx lr
     ; (the scalar is in r0 here, so the predicate is read from r1).
 238 ; CHECK-LABEL: test_vmulq_m_n_u16:
 239 ; CHECK:       @ %bb.0: @ %entry
 240 ; CHECK-NEXT:    vmsr p0, r1
 241 ; CHECK-NEXT:    vpst
 242 ; CHECK-NEXT:    vmult.i16 q0, q1, r0
 243 ; CHECK-NEXT:    bx lr
 244 entry:
 245   %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0
 246   %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
 247   %0 = zext i16 %p to i32
 248   %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
 249   %2 = call <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %.splat, <8 x i1> %1, <8 x i16> %inactive)
 250   ret <8 x i16> %2
 251 }
 252
 253 define arm_aapcs_vfpcc <4 x i32> @test_vmulq_m_n_s32(<4 x i32> %inactive, <4 x i32> %a, i32 %b, i16 zeroext %p) {
     ; Predicated vector-by-scalar multiply: %b is splatted to <4 x i32> and used
     ; as the second operand of llvm.arm.mve.mul.predicated; the predicate is
     ; built from %p (zext + pred.i2v) and %inactive is the fourth operand.
     ; Expected output: vmsr p0, r1 / vpst / vmult.i32 q0, q1, r0 / bx lr
     ; (the scalar is in r0 here, so the predicate is read from r1).
 254 ; CHECK-LABEL: test_vmulq_m_n_s32:
 255 ; CHECK:       @ %bb.0: @ %entry
 256 ; CHECK-NEXT:    vmsr p0, r1
 257 ; CHECK-NEXT:    vpst
 258 ; CHECK-NEXT:    vmult.i32 q0, q1, r0
 259 ; CHECK-NEXT:    bx lr
 260 entry:
 261   %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0
 262   %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
 263   %0 = zext i16 %p to i32
 264   %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
 265   %2 = call <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %.splat, <4 x i1> %1, <4 x i32> %inactive)
 266   ret <4 x i32> %2
 267 }
 268
 269 define arm_aapcs_vfpcc <8 x half> @test_vmulq_m_n_f16(<8 x half> %inactive, <8 x half> %a, float %b.coerce, i16 zeroext %p) {
     ; Predicated vector-by-scalar half multiply. The scalar arrives as a float
     ; (%b.coerce); the IR recovers the half from its low 16 bits
     ; (bitcast to i32 -> trunc to i16 -> bitcast to half) before splatting it.
     ; The splat is the second operand of llvm.arm.mve.mul.predicated, the
     ; predicate comes from %p (zext + pred.i2v), and %inactive is the fourth operand.
     ; Expected output: vmov r1, s8 / vmsr p0, r0 / vpst / vmult.f16 q0, q1, r1 / bx lr.
 270 ; CHECK-LABEL: test_vmulq_m_n_f16:
 271 ; CHECK:       @ %bb.0: @ %entry
 272 ; CHECK-NEXT:    vmov r1, s8
 273 ; CHECK-NEXT:    vmsr p0, r0
 274 ; CHECK-NEXT:    vpst
 275 ; CHECK-NEXT:    vmult.f16 q0, q1, r1
 276 ; CHECK-NEXT:    bx lr
 277 entry:
 278   %0 = bitcast float %b.coerce to i32
 279   %tmp.0.extract.trunc = trunc i32 %0 to i16
 280   %1 = bitcast i16 %tmp.0.extract.trunc to half
 281   %.splatinsert = insertelement <8 x half> undef, half %1, i32 0
 282   %.splat = shufflevector <8 x half> %.splatinsert, <8 x half> undef, <8 x i32> zeroinitializer
 283   %2 = zext i16 %p to i32
 284   %3 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %2)
 285   %4 = call <8 x half> @llvm.arm.mve.mul.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %.splat, <8 x i1> %3, <8 x half> %inactive)
 286   ret <8 x half> %4
 287 }
 288
 289 define arm_aapcs_vfpcc <16 x i8> @test_vmulq_x_n_u8(<16 x i8> %a, i8 zeroext %b, i16 zeroext %p) {
     ; Predicated vector-by-scalar multiply with no %inactive argument: %b is
     ; splatted to <16 x i8>, the predicate is built from %p (zext + pred.i2v),
     ; and the fourth intrinsic operand is undef.
     ; Expected output: vmsr p0, r1 / vpst / vmult.i8 q0, q0, r0 / bx lr.
 290 ; CHECK-LABEL: test_vmulq_x_n_u8:
 291 ; CHECK:       @ %bb.0: @ %entry
 292 ; CHECK-NEXT:    vmsr p0, r1
 293 ; CHECK-NEXT:    vpst
 294 ; CHECK-NEXT:    vmult.i8 q0, q0, r0
 295 ; CHECK-NEXT:    bx lr
 296 entry:
 297   %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0
 298   %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer
 299   %0 = zext i16 %p to i32
 300   %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
 301   %2 = call <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %.splat, <16 x i1> %1, <16 x i8> undef)
 302   ret <16 x i8> %2
 303 }
 304
 305 define arm_aapcs_vfpcc <8 x i16> @test_vmulq_x_n_s16(<8 x i16> %a, i16 signext %b, i16 zeroext %p) {
     ; Predicated vector-by-scalar multiply with no %inactive argument: %b is
     ; splatted to <8 x i16>, the predicate is built from %p (zext + pred.i2v),
     ; and the fourth intrinsic operand is undef.
     ; Expected output: vmsr p0, r1 / vpst / vmult.i16 q0, q0, r0 / bx lr.
 306 ; CHECK-LABEL: test_vmulq_x_n_s16:
 307 ; CHECK:       @ %bb.0: @ %entry
 308 ; CHECK-NEXT:    vmsr p0, r1
 309 ; CHECK-NEXT:    vpst
 310 ; CHECK-NEXT:    vmult.i16 q0, q0, r0
 311 ; CHECK-NEXT:    bx lr
 312 entry:
 313   %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0
 314   %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
 315   %0 = zext i16 %p to i32
 316   %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
 317   %2 = call <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %.splat, <8 x i1> %1, <8 x i16> undef)
 318   ret <8 x i16> %2
 319 }
 320
 321 define arm_aapcs_vfpcc <4 x i32> @test_vmulq_x_n_u32(<4 x i32> %a, i32 %b, i16 zeroext %p) {
     ; Predicated vector-by-scalar multiply with no %inactive argument: %b is
     ; splatted to <4 x i32>, the predicate is built from %p (zext + pred.i2v),
     ; and the fourth intrinsic operand is undef.
     ; Expected output: vmsr p0, r1 / vpst / vmult.i32 q0, q0, r0 / bx lr.
 322 ; CHECK-LABEL: test_vmulq_x_n_u32:
 323 ; CHECK:       @ %bb.0: @ %entry
 324 ; CHECK-NEXT:    vmsr p0, r1
 325 ; CHECK-NEXT:    vpst
 326 ; CHECK-NEXT:    vmult.i32 q0, q0, r0
 327 ; CHECK-NEXT:    bx lr
 328 entry:
 329   %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0
 330   %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
 331   %0 = zext i16 %p to i32
 332   %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
 333   %2 = call <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %.splat, <4 x i1> %1, <4 x i32> undef)
 334   ret <4 x i32> %2
 335 }
 336
 337 define arm_aapcs_vfpcc <4 x float> @test_vmulq_x_n_f32(<4 x float> %a, float %b, i16 zeroext %p) {
     ; Predicated vector-by-scalar float multiply with no %inactive argument:
     ; %b is splatted to <4 x float>, the predicate is built from %p
     ; (zext + pred.i2v), and the fourth intrinsic operand is undef.
     ; Expected output: vmov r1, s4 / vmsr p0, r0 / vpst / vmult.f32 q0, q0, r1 / bx lr
     ; (the float scalar is moved into r1; the predicate is read from r0).
 338 ; CHECK-LABEL: test_vmulq_x_n_f32:
 339 ; CHECK:       @ %bb.0: @ %entry
 340 ; CHECK-NEXT:    vmov r1, s4
 341 ; CHECK-NEXT:    vmsr p0, r0
 342 ; CHECK-NEXT:    vpst
 343 ; CHECK-NEXT:    vmult.f32 q0, q0, r1
 344 ; CHECK-NEXT:    bx lr
 345 entry:
 346   %.splatinsert = insertelement <4 x float> undef, float %b, i32 0
 347   %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
 348   %0 = zext i16 %p to i32
 349   %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
 350   %2 = call <4 x float> @llvm.arm.mve.mul.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %.splat, <4 x i1> %1, <4 x float> undef)
 351   ret <4 x float> %2
 352 }