test/CodeGen/ARM/vqdmul.ll

   1 ; RUN: llc -mattr=+neon < %s | FileCheck %s
   2 target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32"
   3 target triple = "thumbv7-elf" ; the llc invocation above passes no -mtriple, so this module triple picks the target
   4
   5 define <4 x i16> @vqdmulhs16(<4 x i16>* %A, <4 x i16>* %B) nounwind { ; vqdmulh intrinsic on 64-bit <4 x i16> operands loaded from memory
   6 ; CHECK-LABEL: vqdmulhs16:
   7 ; CHECK: vqdmulh.s16
   8   %lhs = load <4 x i16>, <4 x i16>* %A
   9   %rhs = load <4 x i16>, <4 x i16>* %B
  10   %res = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
  11   ret <4 x i16> %res
  12 }
  13
  14 define <2 x i32> @vqdmulhs32(<2 x i32>* %A, <2 x i32>* %B) nounwind { ; vqdmulh intrinsic on 64-bit <2 x i32> operands loaded from memory
  15 ; CHECK-LABEL: vqdmulhs32:
  16 ; CHECK: vqdmulh.s32
  17   %lhs = load <2 x i32>, <2 x i32>* %A
  18   %rhs = load <2 x i32>, <2 x i32>* %B
  19   %res = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
  20   ret <2 x i32> %res
  21 }
  22
  23 define <8 x i16> @vqdmulhQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind { ; vqdmulh intrinsic on 128-bit <8 x i16> operands loaded from memory
  24 ; CHECK-LABEL: vqdmulhQs16:
  25 ; CHECK: vqdmulh.s16
  26   %lhs = load <8 x i16>, <8 x i16>* %A
  27   %rhs = load <8 x i16>, <8 x i16>* %B
  28   %res = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
  29   ret <8 x i16> %res
  30 }
  31
  32 define <4 x i32> @vqdmulhQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind { ; vqdmulh intrinsic on 128-bit <4 x i32> operands loaded from memory
  33 ; CHECK-LABEL: vqdmulhQs32:
  34 ; CHECK: vqdmulh.s32
  35   %lhs = load <4 x i32>, <4 x i32>* %A
  36   %rhs = load <4 x i32>, <4 x i32>* %B
  37   %res = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
  38   ret <4 x i32> %res
  39 }
  40
  41 define arm_aapcs_vfpcc <8 x i16> @test_vqdmulhQ_lanes16(<8 x i16> %arg0_int16x8_t, <4 x i16> %arg1_int16x4_t) nounwind readnone { ; vqdmulh with a splatted lane-1 operand must use the by-lane form d2[1]
  42 entry:
  43 ; CHECK-LABEL: test_vqdmulhQ_lanes16:
  44 ; CHECK: vqdmulh.s16 q0, q0, d2[1]
  45   %splat = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> ; broadcast element 1 of %arg1_int16x4_t to all 8 lanes
  46   %res = tail call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %arg0_int16x8_t, <8 x i16> %splat) ; splat is only used here, as the second multiplicand
  47   ret <8 x i16> %res
  48 }
  49
  50 define arm_aapcs_vfpcc <4 x i32> @test_vqdmulhQ_lanes32(<4 x i32> %arg0_int32x4_t, <2 x i32> %arg1_int32x2_t) nounwind readnone { ; vqdmulh with a splatted lane-1 operand must use the by-lane form d2[1]
  51 entry:
  52 ; CHECK-LABEL: test_vqdmulhQ_lanes32:
  53 ; CHECK: vqdmulh.s32 q0, q0, d2[1]
  54   %splat = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; broadcast element 1 of %arg1_int32x2_t to all 4 lanes
  55   %res = tail call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %splat) ; splat is only used here, as the second multiplicand
  56   ret <4 x i32> %res
  57 }
  58
  59 define arm_aapcs_vfpcc <4 x i16> @test_vqdmulh_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone { ; vqdmulh with a splatted lane-1 operand must use the by-lane form d1[1]
  60 entry:
  61 ; CHECK-LABEL: test_vqdmulh_lanes16:
  62 ; CHECK: vqdmulh.s16 d0, d0, d1[1]
  63   %splat = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; broadcast element 1 of %arg1_int16x4_t to all 4 lanes
  64   %res = tail call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %arg0_int16x4_t, <4 x i16> %splat) ; splat is only used here, as the second multiplicand
  65   ret <4 x i16> %res
  66 }
  67
  68 define arm_aapcs_vfpcc <2 x i32> @test_vqdmulh_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone { ; vqdmulh with a splatted lane-1 operand must use the by-lane form d1[1]
  69 entry:
  70 ; CHECK-LABEL: test_vqdmulh_lanes32:
  71 ; CHECK: vqdmulh.s32 d0, d0, d1[1]
  72   %splat = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; broadcast element 1 of %arg1_int32x2_t to both lanes
  73   %res = tail call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %arg0_int32x2_t, <2 x i32> %splat) ; splat is only used here, as the second multiplicand
  74   ret <2 x i32> %res
  75 }
  76
  77 declare <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
  78 declare <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
  79
  80 declare <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
  81 declare <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
  82
  83 define <4 x i16> @vqrdmulhs16(<4 x i16>* %A, <4 x i16>* %B) nounwind { ; vqrdmulh intrinsic on 64-bit <4 x i16> operands loaded from memory
  84 ; CHECK-LABEL: vqrdmulhs16:
  85 ; CHECK: vqrdmulh.s16
  86   %lhs = load <4 x i16>, <4 x i16>* %A
  87   %rhs = load <4 x i16>, <4 x i16>* %B
  88   %res = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
  89   ret <4 x i16> %res
  90 }
  91
  92 define <2 x i32> @vqrdmulhs32(<2 x i32>* %A, <2 x i32>* %B) nounwind { ; vqrdmulh intrinsic on 64-bit <2 x i32> operands loaded from memory
  93 ; CHECK-LABEL: vqrdmulhs32:
  94 ; CHECK: vqrdmulh.s32
  95   %lhs = load <2 x i32>, <2 x i32>* %A
  96   %rhs = load <2 x i32>, <2 x i32>* %B
  97   %res = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
  98   ret <2 x i32> %res
  99 }
 100
 101 define <8 x i16> @vqrdmulhQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind { ; vqrdmulh intrinsic on 128-bit <8 x i16> operands loaded from memory
 102 ; CHECK-LABEL: vqrdmulhQs16:
 103 ; CHECK: vqrdmulh.s16
 104   %lhs = load <8 x i16>, <8 x i16>* %A
 105   %rhs = load <8 x i16>, <8 x i16>* %B
 106   %res = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
 107   ret <8 x i16> %res
 108 }
 109
 110 define <4 x i32> @vqrdmulhQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind { ; vqrdmulh intrinsic on 128-bit <4 x i32> operands loaded from memory
 111 ; CHECK-LABEL: vqrdmulhQs32:
 112 ; CHECK: vqrdmulh.s32
 113   %lhs = load <4 x i32>, <4 x i32>* %A
 114   %rhs = load <4 x i32>, <4 x i32>* %B
 115   %res = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
 116   ret <4 x i32> %res
 117 }
 118
 119 define arm_aapcs_vfpcc <8 x i16> @test_vqRdmulhQ_lanes16(<8 x i16> %arg0_int16x8_t, <4 x i16> %arg1_int16x4_t) nounwind readnone { ; vqrdmulh with a splatted lane-1 operand must use the by-lane form d2[1]
 120 entry:
 121 ; CHECK-LABEL: test_vqRdmulhQ_lanes16:
 122 ; CHECK: vqrdmulh.s16 q0, q0, d2[1]
 123   %splat = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> ; broadcast element 1 of %arg1_int16x4_t to all 8 lanes
 124   %res = tail call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %arg0_int16x8_t, <8 x i16> %splat) ; splat is only used here, as the second multiplicand
 125   ret <8 x i16> %res
 126 }
 127
 128 define arm_aapcs_vfpcc <4 x i32> @test_vqRdmulhQ_lanes32(<4 x i32> %arg0_int32x4_t, <2 x i32> %arg1_int32x2_t) nounwind readnone { ; vqrdmulh with a splatted lane-1 operand must use the by-lane form d2[1]
 129 entry:
 130 ; CHECK-LABEL: test_vqRdmulhQ_lanes32:
 131 ; CHECK: vqrdmulh.s32 q0, q0, d2[1]
 132   %splat = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; broadcast element 1 of %arg1_int32x2_t to all 4 lanes
 133   %res = tail call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %splat) ; splat is only used here, as the second multiplicand
 134   ret <4 x i32> %res
 135 }
 136
 137 define arm_aapcs_vfpcc <4 x i16> @test_vqRdmulh_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone { ; vqrdmulh with a splatted lane-1 operand must use the by-lane form d1[1]
 138 entry:
 139 ; CHECK-LABEL: test_vqRdmulh_lanes16:
 140 ; CHECK: vqrdmulh.s16 d0, d0, d1[1]
 141   %splat = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; broadcast element 1 of %arg1_int16x4_t to all 4 lanes
 142   %res = tail call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %arg0_int16x4_t, <4 x i16> %splat) ; splat is only used here, as the second multiplicand
 143   ret <4 x i16> %res
 144 }
 145
 146 define arm_aapcs_vfpcc <2 x i32> @test_vqRdmulh_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone { ; vqrdmulh with a splatted lane-1 operand must use the by-lane form d1[1]
 147 entry:
 148 ; CHECK-LABEL: test_vqRdmulh_lanes32:
 149 ; CHECK: vqrdmulh.s32 d0, d0, d1[1]
 150   %splat = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; broadcast element 1 of %arg1_int32x2_t to both lanes
 151   %res = tail call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %arg0_int32x2_t, <2 x i32> %splat) ; splat is only used here, as the second multiplicand
 152   ret <2 x i32> %res
 153 }
 154
 155 declare <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
 156 declare <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
 157
 158 declare <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
 159 declare <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
 160
 161 define <4 x i32> @vqdmulls16(<4 x i16>* %A, <4 x i16>* %B) nounwind { ; widening vqdmull intrinsic: <4 x i16> inputs, <4 x i32> result
 162 ; CHECK-LABEL: vqdmulls16:
 163 ; CHECK: vqdmull.s16
 164   %lhs = load <4 x i16>, <4 x i16>* %A
 165   %rhs = load <4 x i16>, <4 x i16>* %B
 166   %wide = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
 167   ret <4 x i32> %wide
 168 }
 169
 170 define <2 x i64> @vqdmulls32(<2 x i32>* %A, <2 x i32>* %B) nounwind { ; widening vqdmull intrinsic: <2 x i32> inputs, <2 x i64> result
 171 ; CHECK-LABEL: vqdmulls32:
 172 ; CHECK: vqdmull.s32
 173   %lhs = load <2 x i32>, <2 x i32>* %A
 174   %rhs = load <2 x i32>, <2 x i32>* %B
 175   %wide = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %lhs, <2 x i32> %rhs)
 176   ret <2 x i64> %wide
 177 }
 178
 179 define arm_aapcs_vfpcc <4 x i32> @test_vqdmull_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone { ; widening vqdmull with a splatted lane-1 operand must use the by-lane form d1[1]
 180 entry:
 181 ; CHECK-LABEL: test_vqdmull_lanes16:
 182 ; CHECK: vqdmull.s16 q0, d0, d1[1]
 183   %splat = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; broadcast element 1 of %arg1_int16x4_t to all 4 lanes
 184   %wide = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %arg0_int16x4_t, <4 x i16> %splat) ; splat is only used here, as the second multiplicand
 185   ret <4 x i32> %wide
 186 }
 187
 188 define arm_aapcs_vfpcc <2 x i64> @test_vqdmull_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone { ; widening vqdmull with a splatted lane-1 operand must use the by-lane form d1[1]
 189 entry:
 190 ; CHECK-LABEL: test_vqdmull_lanes32:
 191 ; CHECK: vqdmull.s32 q0, d0, d1[1]
 192   %splat = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; broadcast element 1 of %arg1_int32x2_t to both lanes
 193   %wide = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %arg0_int32x2_t, <2 x i32> %splat) ; splat is only used here, as the second multiplicand
 194   ret <2 x i64> %wide
 195 }
 196
 197 declare <4 x i32>  @llvm.arm.neon.vqdmull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
 198 declare <2 x i64>  @llvm.arm.neon.vqdmull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
 199
 200 define <4 x i32> @vqdmlals16_natural(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind { ; vqdmull result fed to vqadds is expected to be selected as vqdmlal.s16
 201 ; CHECK-LABEL: vqdmlals16_natural:
 202 ; CHECK: vqdmlal.s16
 203   %acc = load <4 x i32>, <4 x i32>* %A
 204   %lhs = load <4 x i16>, <4 x i16>* %B
 205   %rhs = load <4 x i16>, <4 x i16>* %C
 206   %prod = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
 207   %sum = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %acc, <4 x i32> %prod)
 208   ret <4 x i32> %sum
 209 }
 210
 211 define <2 x i64> @vqdmlals32_natural(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind { ; vqdmull result fed to vqadds is expected to be selected as vqdmlal.s32
 212 ; CHECK-LABEL: vqdmlals32_natural:
 213 ; CHECK: vqdmlal.s32
 214   %acc = load <2 x i64>, <2 x i64>* %A
 215   %lhs = load <2 x i32>, <2 x i32>* %B
 216   %rhs = load <2 x i32>, <2 x i32>* %C
 217   %prod = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %lhs, <2 x i32> %rhs)
 218   %sum = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %acc, <2 x i64> %prod)
 219   ret <2 x i64> %sum
 220 }
 221
 222 define arm_aapcs_vfpcc <4 x i32> @test_vqdmlal_lanes16_natural(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone { ; by-lane vqdmull plus vqadds is expected to be selected as vqdmlal.s16 with d3[1]
 223 entry:
 224 ; CHECK-LABEL: test_vqdmlal_lanes16_natural:
 225 ; CHECK: vqdmlal.s16 q0, d2, d3[1]
 226   %splat = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; broadcast element 1 of %arg2_int16x4_t to all 4 lanes
 227   %prod = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %arg1_int16x4_t, <4 x i16> %splat)
 228   %sum = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %prod)
 229   ret <4 x i32> %sum
 230 }
 231
 232 define arm_aapcs_vfpcc <2 x i64> @test_vqdmlal_lanes32_natural(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone { ; by-lane vqdmull plus vqadds is expected to be selected as vqdmlal.s32 with d3[1]
 233 entry:
 234 ; CHECK-LABEL: test_vqdmlal_lanes32_natural:
 235 ; CHECK: vqdmlal.s32 q0, d2, d3[1]
 236   %splat = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; broadcast element 1 of %arg2_int32x2_t to both lanes
 237   %prod = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %arg1_int32x2_t, <2 x i32> %splat)
 238   %sum = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i64> %prod)
 239   ret <2 x i64> %sum
 240 }
 241
 242 declare <4 x i32>  @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
 243 declare <2 x i64>  @llvm.arm.neon.vqadds.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
 244
 245 define <4 x i32> @vqdmlsls16_natural(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind { ; vqdmull result fed to vqsubs is expected to be selected as vqdmlsl.s16
 246 ; CHECK-LABEL: vqdmlsls16_natural:
 247 ; CHECK: vqdmlsl.s16
 248   %acc = load <4 x i32>, <4 x i32>* %A
 249   %lhs = load <4 x i16>, <4 x i16>* %B
 250   %rhs = load <4 x i16>, <4 x i16>* %C
 251   %prod = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
 252   %diff = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %acc, <4 x i32> %prod)
 253   ret <4 x i32> %diff
 254 }
 255
 256 define <2 x i64> @vqdmlsls32_natural(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind { ; vqdmull result fed to vqsubs is expected to be selected as vqdmlsl.s32
 257 ; CHECK-LABEL: vqdmlsls32_natural:
 258 ; CHECK: vqdmlsl.s32
 259   %acc = load <2 x i64>, <2 x i64>* %A
 260   %lhs = load <2 x i32>, <2 x i32>* %B
 261   %rhs = load <2 x i32>, <2 x i32>* %C
 262   %prod = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %lhs, <2 x i32> %rhs)
 263   %diff = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %acc, <2 x i64> %prod)
 264   ret <2 x i64> %diff
 265 }
 266
 267 define arm_aapcs_vfpcc <4 x i32> @test_vqdmlsl_lanes16_natural(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone { ; by-lane vqdmull plus vqsubs is expected to be selected as vqdmlsl.s16 with d3[1]
 268 entry:
 269 ; CHECK-LABEL: test_vqdmlsl_lanes16_natural:
 270 ; CHECK: vqdmlsl.s16 q0, d2, d3[1]
 271   %splat = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; broadcast element 1 of %arg2_int16x4_t to all 4 lanes
 272   %prod = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %arg1_int16x4_t, <4 x i16> %splat)
 273   %diff = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %prod)
 274   ret <4 x i32> %diff
 275 }
 276
 277 define arm_aapcs_vfpcc <2 x i64> @test_vqdmlsl_lanes32_natural(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone { ; by-lane vqdmull plus vqsubs is expected to be selected as vqdmlsl.s32 with d3[1]
 278 entry:
 279 ; CHECK-LABEL: test_vqdmlsl_lanes32_natural:
 280 ; CHECK: vqdmlsl.s32 q0, d2, d3[1]
 281   %splat = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; broadcast element 1 of %arg2_int32x2_t to both lanes
 282   %prod = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %arg1_int32x2_t, <2 x i32> %splat)
 283   %diff = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i64> %prod)
 284   ret <2 x i64> %diff
 285 }
 286
 287 declare <4 x i32>  @llvm.arm.neon.vqsubs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
 288 declare <2 x i64>  @llvm.arm.neon.vqsubs.v2i64(<2 x i64>, <2 x i64>) nounwind readnone