gcc/testsuite/gcc.dg/vect/pr112325.c

   1 /* { dg-do compile } */
   2 /* { dg-options "-O3 -funroll-loops -fdump-tree-vect-details" } */
   3 /* { dg-require-effective-target vect_int } */
   4 /* { dg-require-effective-target vect_shift } */
   5 /* { dg-additional-options "-mavx2" { target x86_64-*-* i?86-*-* } } */
   6 /* { dg-additional-options "--param max-completely-peeled-insns=200" { target powerpc64*-*-* } } */
   7
   8 typedef unsigned short ggml_fp16_t; /* Raw 16-bit value; only ever used in this file as an index into table_f32_f16 (presumably half-precision float bits, going by the name).  */
   9 static float table_f32_f16[1 << 16]; /* One float entry for each of the 65536 possible 16-bit index values.  */
  10
  11 inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
         /* Returns the table_f32_f16 entry selected by the 16 bits of f.
            The bits are copied into an unsigned short with memcpy rather
            than being reinterpreted through a pointer cast.  */
  12     unsigned short s;
  13     __builtin_memcpy(&s, &f, sizeof(unsigned short));
  14     return table_f32_f16[s];
  15 }
  16
  17 typedef struct {
         /* One block of 32 packed values as consumed by
            ggml_vec_dot_q5_1_q8_1: 4 low bits per value live in qs, and
            one extra bit per value lives in qh.  */
  18     ggml_fp16_t d;              /* Table index; the looked-up float multiplies the block's integer sum.  */
  19     ggml_fp16_t m;              /* Table index; the looked-up float multiplies block_q8_1.s.  */
  20     unsigned char qh[4];        /* 32 single bits, read as one unsigned; each becomes bit 4 of one value.  */
  21     unsigned char qs[32 / 2];   /* 16 bytes, two 4-bit values per byte (low nibble and high nibble).  */
  22 } block_q5_1;
  23
  24 typedef struct {
         /* One block of 32 byte-wide values that ggml_vec_dot_q5_1_q8_1
            multiplies against a block_q5_1.  */
  25     float d;        /* Multiplied with the float looked up from block_q5_1.d.  */
  26     float s;        /* Multiplied with the float looked up from block_q5_1.m.  */
  27     char qs[32];    /* The 32 values; plain char, so signedness follows the target.  */
  28 } block_q8_1;
  29
  30 void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
         /* Computes a scaled dot product over n / 32 pairs of blocks and
            stores the float result in *s.

            n   element count; only whole groups of 32 are processed.
            s   receives the accumulated sum.
            vx  read as an array of block_q5_1.
            vy  read as an array of block_q8_1.

            NOTE(review): this file is a compiler test whose final
            directive counts vectorized loops, so the statement shape
            below should be kept exactly as it is.  */
  31     const int qk = 32;
  32     const int nb = n / qk;      /* Number of whole blocks; any remainder of n is ignored.  */
  33
  34     const block_q5_1 * restrict x = vx;
  35     const block_q8_1 * restrict y = vy;
  36
  37     float sumf = 0.0;
  38
  39     for (int i = 0; i < nb; i++) {
             /* Gather the qh bytes into one unsigned so that single bits
                can be shifted out below.
                NOTE(review): copies sizeof(unsigned) bytes out of a 4-byte
                array, so this assumes a 4-byte unsigned -- confirm for
                the targets of interest.  */
  40         unsigned qh;
  41         __builtin_memcpy(&qh, x[i].qh, sizeof(qh));
  42
  43         int sumi = 0;
  44
             /* Each iteration consumes one byte of x[i].qs, i.e. two
                packed values.  */
  45         for (int j = 0; j < qk/2; ++j) {
                 /* xh_0: bit j of qh moved to bit position 4.
                    xh_1: bit (j + 16) of qh; shifting right by j + 12
                    leaves it at bit position 4 as well.  */
  46             const unsigned char xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
  47             const unsigned char xh_1 = ((qh >> (j + 12)) ) & 0x10;
  48
                 /* Low / high nibble of the byte with the extra bit or'ed
                    in as bit 4, giving a value in 0..31.  */
  49             const int x0 = (x[i].qs[j] & 0xF) | xh_0;
  50             const int x1 = (x[i].qs[j] >> 4) | xh_1;
  51
                 /* Low-nibble values pair with y[i].qs[j], high-nibble
                    values with y[i].qs[j + 16].  */
  52             sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
  53         }
  54
             /* Scale the integer sum by table[x.d] * y.d and add the
                per-block term table[x.m] * y.s.  */
  55         sumf += (ggml_lookup_fp16_to_fp32(x[i].d)*y[i].d)*sumi + ggml_lookup_fp16_to_fp32(x[i].m)*y[i].s;
  56     }
  57
  58     *s = sumf;
  59 }
  60
  61 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */