8 inline static int bt1_pop64(uint64_t x
) // the kbi_popcount64() equivalence; similar to popcount_2() in wiki
10 x
-= ((x
>> 1) & 0x5555555555555555llu
);
11 x
= (x
& 0x3333333333333333llu
) + ((x
>> 2) & 0x3333333333333333llu
);
12 x
= (x
+ (x
>> 4)) & 0x0F0F0F0F0F0F0F0Fllu
;
19 inline static int bt1_countInU64(uint64_t dw
, int c
) // the kbi_DNAcount64() equivalence
21 uint64_t dwA
= dw
& 0xAAAAAAAAAAAAAAAAllu
;
22 uint64_t dwNA
= dw
& ~0xAAAAAAAAAAAAAAAAllu
;
25 case 0: tmp
= (dwA
>> 1) | dwNA
; break;
26 case 1: tmp
= ~(dwA
>> 1) & dwNA
; break;
27 case 2: tmp
= (dwA
>> 1) & ~dwNA
; break;
28 default: tmp
= (dwA
>> 1) & dwNA
;
31 if (c
== 0) tmp
= 32 - tmp
;
36 static uint32_t sse2_bit_count32(const __m128i
* block
, const __m128i
* block_end
)
38 const unsigned mu1
= 0x55555555;
39 const unsigned mu2
= 0x33333333;
40 const unsigned mu3
= 0x0F0F0F0F;
41 const unsigned mu4
= 0x0000003F;
46 __m128i m1
= _mm_set_epi32 (mu1
, mu1
, mu1
, mu1
);
47 __m128i m2
= _mm_set_epi32 (mu2
, mu2
, mu2
, mu2
);
48 __m128i m3
= _mm_set_epi32 (mu3
, mu3
, mu3
, mu3
);
49 __m128i m4
= _mm_set_epi32 (mu4
, mu4
, mu4
, mu4
);
51 mcnt
= _mm_xor_si128(m1
, m1
); // cnt = 0
56 __m128i b
= _mm_load_si128(block
);
59 // b = (b & 0x55555555) + (b >> 1 & 0x55555555);
60 tmp1
= _mm_srli_epi32(b
, 1); // tmp1 = (b >> 1 & 0x55555555)
61 tmp1
= _mm_and_si128(tmp1
, m1
);
62 tmp2
= _mm_and_si128(b
, m1
); // tmp2 = (b & 0x55555555)
63 b
= _mm_add_epi32(tmp1
, tmp2
); // b = tmp1 + tmp2
65 // b = (b & 0x33333333) + (b >> 2 & 0x33333333);
66 tmp1
= _mm_srli_epi32(b
, 2); // (b >> 2 & 0x33333333)
67 tmp1
= _mm_and_si128(tmp1
, m2
);
68 tmp2
= _mm_and_si128(b
, m2
); // (b & 0x33333333)
69 b
= _mm_add_epi32(tmp1
, tmp2
); // b = tmp1 + tmp2
71 // b = (b + (b >> 4)) & 0x0F0F0F0F;
72 tmp1
= _mm_srli_epi32(b
, 4); // tmp1 = b >> 4
73 b
= _mm_add_epi32(b
, tmp1
); // b = b + (b >> 4)
74 b
= _mm_and_si128(b
, m3
); // & 0x0F0F0F0F
77 tmp1
= _mm_srli_epi32 (b
, 8); // tmp1 = b >> 8
78 b
= _mm_add_epi32(b
, tmp1
); // b = b + (b >> 8)
80 // b = (b + (b >> 16)) & 0x0000003F;
81 tmp1
= _mm_srli_epi32 (b
, 16); // b >> 16
82 b
= _mm_add_epi32(b
, tmp1
); // b + (b >> 16)
83 b
= _mm_and_si128(b
, m4
); // (b >> 16) & 0x0000003F;
85 mcnt
= _mm_add_epi32(mcnt
, b
); // mcnt += b
87 } while (block
< block_end
);
89 _mm_store_si128((__m128i
*)tcnt
, mcnt
);
91 return tcnt
[0] + tcnt
[1] + tcnt
[2] + tcnt
[3];
101 x
= (uint64_t*)calloc(N
, 8);
103 for (i
= 0; i
< N
; ++i
)
104 x
[i
] = (uint64_t)lrand48() << 32 ^ lrand48();
106 fprintf(stderr
, "\n===> Calculate # of 1 in an integer (popcount) <===\n");
108 t
= clock(); cnt
= 0;
109 for (i
= 0; i
< N
; ++i
) cnt
+= kbi_popcount64(x
[i
]);
110 fprintf(stderr
, "%20s\t%20ld\t%10.6f\n", "kbit", (long)cnt
, (double)(clock() - t
) / CLOCKS_PER_SEC
);
112 t
= clock(); cnt
= 0;
113 for (i
= 0; i
< N
; ++i
) cnt
+= bt1_pop64(x
[i
]);
114 fprintf(stderr
, "%20s\t%20ld\t%10.6f\n", "wiki-popcount_2", (long)cnt
, (double)(clock() - t
) / CLOCKS_PER_SEC
);
116 t
= clock(); cnt
= 0;
117 for (i
= 0; i
< N
; ++i
) cnt
+= __builtin_popcountl(x
[i
]);
118 fprintf(stderr
, "%20s\t%20ld\t%10.6f\n", "__builtin_popcountl", (long)cnt
, (double)(clock() - t
) / CLOCKS_PER_SEC
);
120 t
= clock(); cnt
= 0;
121 cnt
+= sse2_bit_count32((__m128i
*)x
, (__m128i
*)(x
+N
));
122 fprintf(stderr
, "%20s\t%20ld\t%10.6f\n", "SSE2-32bit", (long)cnt
, (double)(clock() - t
) / CLOCKS_PER_SEC
);
124 fprintf(stderr
, "\n===> Count '%c' in 2-bit encoded integers <===\n", "ACGT"[c
]);
126 t
= clock(); cnt
= 0;
127 for (i
= 0; i
< N
; ++i
) cnt
+= kbi_DNAcount64(x
[i
], c
);
128 fprintf(stderr
, "%20s\t%20ld\t%10.6f\n", "kbit", (long)cnt
, (double)(clock() - t
) / CLOCKS_PER_SEC
);
130 t
= clock(); cnt
= 0;
131 for (i
= 0; i
< N
; ++i
) cnt
+= bt1_countInU64(x
[i
], c
);
132 fprintf(stderr
, "%20s\t%20ld\t%10.6f\n", "bowtie1", (long)cnt
, (double)(clock() - t
) / CLOCKS_PER_SEC
);
134 fprintf(stderr
, "\n");