// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.6a -target-feature +i8mm \
// RUN: -S -disable-O0-optnone -emit-llvm -o - %s \
// RUN: | opt -S -passes=mem2reg,sroa \
// RUN: | FileCheck %s

// REQUIRES: aarch64-registered-target

#include <arm_neon.h>
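
// 8-bit integer matrix multiply-accumulate intrinsics from the Armv8.6-A
// I8MM extension. Each multiplies a 2x8 matrix of bytes by an 8x2 matrix of
// bytes and accumulates into a 2x2 matrix of 32-bit lanes, lowering to the
// llvm.aarch64.neon.{s,u,us}mmla intrinsics.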
// CHECK-LABEL: test_vmmlaq_s32
// CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.aarch64.neon.smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
// CHECK: ret <4 x i32> [[VAL]]
int32x4_t test_vmmlaq_s32(int32x4_t r, int8x16_t a, int8x16_t b) {
  return vmmlaq_s32(r, a, b);
}
// CHECK-LABEL: test_vmmlaq_u32
// CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
// CHECK: ret <4 x i32> [[VAL]]
uint32x4_t test_vmmlaq_u32(uint32x4_t r, uint8x16_t a, uint8x16_t b) {
  return vmmlaq_u32(r, a, b);
}
// CHECK-LABEL: test_vusmmlaq_s32
// CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.aarch64.neon.usmmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
// CHECK: ret <4 x i32> [[VAL]]
int32x4_t test_vusmmlaq_s32(int32x4_t r, uint8x16_t a, int8x16_t b) {
  return vusmmlaq_s32(r, a, b);
}
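
// Mixed-sign dot product intrinsics: vusdot_* multiplies unsigned by signed
// 8-bit elements, accumulating each group of four byte products into a
// 32-bit lane. The vsudot_* (signed-by-unsigned) forms lower to the same
// usdot intrinsic with the multiplicand operands swapped.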
// CHECK-LABEL: test_vusdot_s32
// CHECK: [[VAL:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b)
// CHECK: ret <2 x i32> [[VAL]]
int32x2_t test_vusdot_s32(int32x2_t r, uint8x8_t a, int8x8_t b) {
  return vusdot_s32(r, a, b);
}
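
// The _lane/_laneq variants below first splat the selected 32-bit lane of
// the lane-indexed operand: the byte vector is bitcast to 32-bit lanes, a
// shufflevector broadcasts lane 0 (all tests use index 0, hence the
// zeroinitializer mask), and the result is bitcast back to bytes before the
// usdot call.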
// CHECK-LABEL: test_vusdot_lane_s32
// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %b to <2 x i32>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer
// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> %r to <8 x i8>
// CHECK: [[OP:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> [[TMP4]])
// CHECK: ret <2 x i32> [[OP]]
int32x2_t test_vusdot_lane_s32(int32x2_t r, uint8x8_t a, int8x8_t b) {
  return vusdot_lane_s32(r, a, b, 0);
}
// CHECK-LABEL: test_vsudot_lane_s32
// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %b to <2 x i32>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer
// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> %r to <8 x i8>
// CHECK: [[OP:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> [[TMP4]], <8 x i8> %a)
// CHECK: ret <2 x i32> [[OP]]
int32x2_t test_vsudot_lane_s32(int32x2_t r, int8x8_t a, uint8x8_t b) {
  return vsudot_lane_s32(r, a, b, 0);
}
// CHECK-LABEL: test_vusdot_laneq_s32
// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %b to <4 x i32>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP2]], <2 x i32> zeroinitializer
// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> %r to <8 x i8>
// CHECK: [[OP:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> [[TMP4]])
// CHECK: ret <2 x i32> [[OP]]
int32x2_t test_vusdot_laneq_s32(int32x2_t r, uint8x8_t a, int8x16_t b) {
  return vusdot_laneq_s32(r, a, b, 0);
}
// CHECK-LABEL: test_vsudot_laneq_s32
// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %b to <4 x i32>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP2]], <2 x i32> zeroinitializer
// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> %r to <8 x i8>
// CHECK: [[OP:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> [[TMP4]], <8 x i8> %a)
// CHECK: ret <2 x i32> [[OP]]
int32x2_t test_vsudot_laneq_s32(int32x2_t r, int8x8_t a, uint8x16_t b) {
  return vsudot_laneq_s32(r, a, b, 0);
}
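
// The q-suffixed variants perform the same operations on full 128-bit
// vectors, producing <4 x i32> accumulators from <16 x i8> inputs.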
// CHECK-LABEL: test_vusdotq_s32
// CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
// CHECK: ret <4 x i32> [[VAL]]
int32x4_t test_vusdotq_s32(int32x4_t r, uint8x16_t a, int8x16_t b) {
  return vusdotq_s32(r, a, b);
}
// CHECK-LABEL: test_vusdotq_lane_s32
// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %b to <2 x i32>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <4 x i32> zeroinitializer
// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8>
// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> %r to <16 x i8>
// CHECK: [[OP:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> [[TMP4]])
// CHECK: ret <4 x i32> [[OP]]
int32x4_t test_vusdotq_lane_s32(int32x4_t r, uint8x16_t a, int8x8_t b) {
  return vusdotq_lane_s32(r, a, b, 0);
}
// CHECK-LABEL: test_vsudotq_lane_s32
// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %b to <2 x i32>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <4 x i32> zeroinitializer
// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8>
// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> %r to <16 x i8>
// CHECK: [[OP:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> [[TMP4]], <16 x i8> %a)
// CHECK: ret <4 x i32> [[OP]]
int32x4_t test_vsudotq_lane_s32(int32x4_t r, int8x16_t a, uint8x8_t b) {
  return vsudotq_lane_s32(r, a, b, 0);
}
// CHECK-LABEL: test_vusdotq_laneq_s32
// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %b to <4 x i32>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP2]], <4 x i32> zeroinitializer
// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8>
// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> %r to <16 x i8>
// CHECK: [[OP:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> [[TMP4]])
// CHECK: ret <4 x i32> [[OP]]
int32x4_t test_vusdotq_laneq_s32(int32x4_t r, uint8x16_t a, int8x16_t b) {
  return vusdotq_laneq_s32(r, a, b, 0);
}
// CHECK-LABEL: test_vsudotq_laneq_s32
// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %b to <4 x i32>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP2]], <4 x i32> zeroinitializer
// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8>
// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> %r to <16 x i8>
// CHECK: [[OP:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> [[TMP4]], <16 x i8> %a)
// CHECK: ret <4 x i32> [[OP]]
int32x4_t test_vsudotq_laneq_s32(int32x4_t r, int8x16_t a, uint8x16_t b) {
  return vsudotq_laneq_s32(r, a, b, 0);
}