/* { dg-do compile } */
/* { dg-options "-O3 -funroll-loops -fdump-tree-vect-details" } */
/* { dg-require-effective-target vect_int } */
/* { dg-require-effective-target vect_shift } */
/* { dg-additional-options "-mavx2" { target x86_64-*-* i?86-*-* } } */
/* { dg-additional-options "--param max-completely-peeled-insns=200" { target powerpc64*-*-* } } */

/* Raw 16-bit value.  The name suggests a half-precision bit pattern, but
   this file only ever uses it as an index into table_f32_f16.  */
typedef unsigned short ggml_fp16_t;

/* Lookup table with 1 << 16 float entries, indexed by a ggml_fp16_t.
   It has static storage duration, so it starts out zero-initialized;
   nothing in this file writes to it.  */
static float table_f32_f16[1 << 16];

11 inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f
) {
13 __builtin_memcpy(&s
, &f
, sizeof(unsigned short));
14 return table_f32_f16
[s
];
21 unsigned char qs
[32 / 2];
30 void ggml_vec_dot_q5_1_q8_1(const int n
, float * restrict s
, const void * restrict vx
, const void * restrict vy
) {
32 const int nb
= n
/ qk
;
34 const block_q5_1
* restrict x
= vx
;
35 const block_q8_1
* restrict y
= vy
;
39 for (int i
= 0; i
< nb
; i
++) {
41 __builtin_memcpy(&qh
, x
[i
].qh
, sizeof(qh
));
45 for (int j
= 0; j
< qk
/2; ++j
) {
46 const unsigned char xh_0
= ((qh
>> (j
+ 0)) << 4) & 0x10;
47 const unsigned char xh_1
= ((qh
>> (j
+ 12)) ) & 0x10;
49 const int x0
= (x
[i
].qs
[j
] & 0xF) | xh_0
;
50 const int x1
= (x
[i
].qs
[j
] >> 4) | xh_1
;
52 sumi
+= (x0
* y
[i
].qs
[j
]) + (x1
* y
[i
].qs
[j
+ qk
/2]);
55 sumf
+= (ggml_lookup_fp16_to_fp32(x
[i
].d
)*y
[i
].d
)*sumi
+ ggml_lookup_fp16_to_fp32(x
[i
].m
)*y
[i
].s
;
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */