// SPDX-License-Identifier: 0BSD

///////////////////////////////////////////////////////////////////////////////
//
/// \file       tuklib_integer.h
/// \brief      Various integer and bit operations
///
/// This file provides macros or functions to do some basic integer and bit
/// operations.
///
/// Native endian inline functions (XX = 16, 32, or 64):
///   - Unaligned native endian reads: readXXne(ptr)
///   - Unaligned native endian writes: writeXXne(ptr, num)
///   - Aligned native endian reads: aligned_readXXne(ptr)
///   - Aligned native endian writes: aligned_writeXXne(ptr, num)
///
/// Endianness-converting integer operations (these can be macros!)
/// (XX = 16, 32, or 64; Y = b or l):
///   - Byte swapping: byteswapXX(num)
///   - Byte order conversions to/from native (byteswaps if Y isn't
///     the native endianness): convXXYe(num)
///   - Unaligned reads: readXXYe(ptr)
///   - Unaligned writes: writeXXYe(ptr, num)
///   - Aligned reads: aligned_readXXYe(ptr)
///   - Aligned writes: aligned_writeXXYe(ptr, num)
///
/// Since the above can be macros, the arguments should have no side effects
/// because they may be evaluated more than once.
///
/// Bit scan operations for non-zero 32-bit integers (inline functions):
///   - Bit scan reverse (find highest non-zero bit): bsr32(num)
///   - Count leading zeros: clz32(num)
///   - Count trailing zeros: ctz32(num)
///   - Bit scan forward (simply an alias for ctz32()): bsf32(num)
///
/// The above bit scan operations return 0-31. If num is zero,
/// the result is undefined.
//
//  Authors:    Lasse Collin
//
///////////////////////////////////////////////////////////////////////////////
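// The sketch below is only an illustration of how the functions declared in
// this header might be used; it is not part of the header itself. The struct
// and function names (example_header, example_parse, example_serialize) are
// made up for the example.
/*
#include <stdint.h>

typedef struct {
	uint32_t size;   // stored in the buffer as unaligned little endian
	uint32_t log2;   // position of the highest set bit of size
} example_header;

static void
example_parse(example_header *hdr, const uint8_t *buf)
{
	hdr->size = read32le(buf);          // unaligned little-endian read
	hdr->log2 = bsr32(hdr->size | 1);   // bsr32() needs a non-zero argument
}

static void
example_serialize(uint8_t *buf, const example_header *hdr)
{
	write32le(buf, hdr->size);          // unaligned little-endian write
}
*/
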
#ifndef TUKLIB_INTEGER_H
#define TUKLIB_INTEGER_H

#include "tuklib_common.h"
#include <string.h>

// Newer Intel C compilers require immintrin.h for _bit_scan_reverse()
// and such functions.
#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1500)
#	include <immintrin.h>
// Only include <intrin.h> when it is needed. GCC and Clang can both
// use __builtin's, so we only need Windows intrinsics when using MSVC.
// GCC and Clang can set _MSC_VER on Windows, so we need to exclude these
// cases explicitly.
#elif defined(_MSC_VER) && !TUKLIB_GNUC_REQ(3, 4) && !defined(__clang__)
#	include <intrin.h>
#endif


///////////////////
// Byte swapping //
///////////////////
#if defined(HAVE___BUILTIN_BSWAPXX)
	// GCC >= 4.8 and Clang
#	define byteswap16(num) __builtin_bswap16(num)
#	define byteswap32(num) __builtin_bswap32(num)
#	define byteswap64(num) __builtin_bswap64(num)

#elif defined(HAVE_BYTESWAP_H)
	// glibc, uClibc, dietlibc
#	include <byteswap.h>
#	ifdef HAVE_BSWAP_16
#		define byteswap16(num) bswap_16(num)
#	endif
#	ifdef HAVE_BSWAP_32
#		define byteswap32(num) bswap_32(num)
#	endif
#	ifdef HAVE_BSWAP_64
#		define byteswap64(num) bswap_64(num)
#	endif

#elif defined(HAVE_SYS_ENDIAN_H)
	// *BSDs and Darwin
#	include <sys/endian.h>
#	define byteswap16(num) bswap16(num)
#	define byteswap32(num) bswap32(num)
#	define byteswap64(num) bswap64(num)

#elif defined(HAVE_SYS_BYTEORDER_H)
	// Solaris
#	include <sys/byteorder.h>
#	ifdef BSWAP_16
#		define byteswap16(num) BSWAP_16(num)
#	endif
#	ifdef BSWAP_32
#		define byteswap32(num) BSWAP_32(num)
#	endif
#	ifdef BSWAP_64
#		define byteswap64(num) BSWAP_64(num)
#	endif
#	ifdef BE_16
#		define conv16be(num) BE_16(num)
#	endif
#	ifdef BE_32
#		define conv32be(num) BE_32(num)
#	endif
#	ifdef BE_64
#		define conv64be(num) BE_64(num)
#	endif
#	ifdef LE_16
#		define conv16le(num) LE_16(num)
#	endif
#	ifdef LE_32
#		define conv32le(num) LE_32(num)
#	endif
#	ifdef LE_64
#		define conv64le(num) LE_64(num)
#	endif
#endif
#ifndef byteswap16
#	define byteswap16(n) (uint16_t)( \
		  (((n) & 0x00FFU) << 8) \
		| (((n) & 0xFF00U) >> 8) \
	)
#endif

#ifndef byteswap32
#	define byteswap32(n) (uint32_t)( \
		  (((n) & UINT32_C(0x000000FF)) << 24) \
		| (((n) & UINT32_C(0x0000FF00)) << 8) \
		| (((n) & UINT32_C(0x00FF0000)) >> 8) \
		| (((n) & UINT32_C(0xFF000000)) >> 24) \
	)
#endif

#ifndef byteswap64
#	define byteswap64(n) (uint64_t)( \
		  (((n) & UINT64_C(0x00000000000000FF)) << 56) \
		| (((n) & UINT64_C(0x000000000000FF00)) << 40) \
		| (((n) & UINT64_C(0x0000000000FF0000)) << 24) \
		| (((n) & UINT64_C(0x00000000FF000000)) << 8) \
		| (((n) & UINT64_C(0x000000FF00000000)) >> 8) \
		| (((n) & UINT64_C(0x0000FF0000000000)) >> 24) \
		| (((n) & UINT64_C(0x00FF000000000000)) >> 40) \
		| (((n) & UINT64_C(0xFF00000000000000)) >> 56) \
	)
#endif
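// A quick sanity check of the byte swapping macros (an illustrative sketch
// only, not part of this header): both the fallback macros above and the
// intrinsic-based variants must satisfy these equalities.
/*
#include <assert.h>

static void
example_byteswap_check(void)
{
	assert(byteswap16(UINT16_C(0x1122)) == UINT16_C(0x2211));
	assert(byteswap32(UINT32_C(0x11223344)) == UINT32_C(0x44332211));
	assert(byteswap64(UINT64_C(0x1122334455667788))
			== UINT64_C(0x8877665544332211));
}
*/
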
// Define conversion macros using the basic byte swapping macros.
#ifdef WORDS_BIGENDIAN
#	ifndef conv16be
#		define conv16be(num) ((uint16_t)(num))
#	endif
#	ifndef conv32be
#		define conv32be(num) ((uint32_t)(num))
#	endif
#	ifndef conv64be
#		define conv64be(num) ((uint64_t)(num))
#	endif
#	ifndef conv16le
#		define conv16le(num) byteswap16(num)
#	endif
#	ifndef conv32le
#		define conv32le(num) byteswap32(num)
#	endif
#	ifndef conv64le
#		define conv64le(num) byteswap64(num)
#	endif
#else
#	ifndef conv16be
#		define conv16be(num) byteswap16(num)
#	endif
#	ifndef conv32be
#		define conv32be(num) byteswap32(num)
#	endif
#	ifndef conv64be
#		define conv64be(num) byteswap64(num)
#	endif
#	ifndef conv16le
#		define conv16le(num) ((uint16_t)(num))
#	endif
#	ifndef conv32le
#		define conv32le(num) ((uint32_t)(num))
#	endif
#	ifndef conv64le
#		define conv64le(num) ((uint64_t)(num))
#	endif
#endif
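// Illustrative examples (not additional definitions): with the macros above,
// on a little endian host conv32le(num) is effectively a no-op cast while
// conv32be(UINT32_C(0x11223344)) evaluates to UINT32_C(0x44332211);
// on a big endian host the roles are reversed.
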
////////////////////////////////
// Unaligned reads and writes //
////////////////////////////////

// No-strict-align archs like x86-64
// ---------------------------------
//
// The traditional way of casting e.g. *(const uint16_t *)uint8_pointer
// is bad even if the uint8_pointer is properly aligned because this kind
// of cast breaks strict aliasing rules and results in undefined behavior.
// With unaligned pointers it's even worse: compilers may emit vector
// instructions that require aligned pointers even if non-vector
// instructions work with unaligned pointers.
//
// Using memcpy() is the standard compliant way to do unaligned access.
// Many modern compilers inline it so there is no function call overhead.
// For those compilers that don't handle the memcpy() method well, the
// old casting method (that violates strict aliasing) can be requested at
// build time. A third method, casting to a packed struct, would also be
// an option but isn't provided to keep things simpler (it's already a mess).
// Hopefully this is flexible enough in practice.
//
// Some compilers on x86-64 like Clang >= 10 and GCC >= 5.1 detect that
//
//     buf[0] | (buf[1] << 8)
//
// reads a 16-bit value and can emit a single 16-bit load and produce
// code identical to the memcpy() method. In other cases Clang and GCC
// produce either the same or better code with memcpy(). For example, Clang 9
// on x86-64 can detect the 32-bit load but not the 16-bit load.
//
// MSVC uses unaligned access with the memcpy() method but emits byte-by-byte
// code for "buf[0] | (buf[1] << 8)".
//
// Conclusion: The memcpy() method is the best choice when unaligned access
// is supported.
//
// Strict-align archs like SPARC
// -----------------------------
//
// GCC versions from around 4.x to at least 13.2.0 produce worse code
// from the memcpy() method than from simple byte-by-byte shift-or code
// when reading a 32-bit integer:
//
//     (1) It may be constructed on stack using four 8-bit loads,
//         four 8-bit stores to stack, and finally one 32-bit load from stack.
//
//     (2) Especially with -Os, an actual memcpy() call may be emitted.
//
// This is true at least on ARM, ARM64, SPARC, SPARC64, MIPS64EL, and
// RISC-V. Of these, ARM, ARM64, and RISC-V support unaligned access in
// some processors but not all, so this is relevant only in the case when
// GCC assumes that unaligned access is not supported or -mstrict-align or
// -mno-unaligned-access is used.
//
// For Clang it makes little difference. ARM64 with -O2 -mstrict-align
// was one of the very few with a minor difference: the memcpy() version
// was one instruction longer.
//
// Conclusion: At least in the case of GCC and Clang, byte-by-byte code is
// the best choice for strict-align archs to do unaligned access.
//
// See also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111502
//
// Thanks to <https://godbolt.org/> it was easy to test different compilers.
// The following is for little endian targets:
/*
#include <stdint.h>
#include <string.h>

uint32_t bytes16(const uint8_t *b)
{
    return (uint32_t)b[0]
        | ((uint32_t)b[1] << 8);
}

uint32_t copy16(const uint8_t *b)
{
    uint16_t v;
    memcpy(&v, b, sizeof(v));
    return v;
}

uint32_t bytes32(const uint8_t *b)
{
    return (uint32_t)b[0]
        | ((uint32_t)b[1] << 8)
        | ((uint32_t)b[2] << 16)
        | ((uint32_t)b[3] << 24);
}

uint32_t copy32(const uint8_t *b)
{
    uint32_t v;
    memcpy(&v, b, sizeof(v));
    return v;
}

void wbytes16(uint8_t *b, uint16_t v)
{
    b[0] = (uint8_t)v;
    b[1] = (uint8_t)(v >> 8);
}

void wcopy16(uint8_t *b, uint16_t v)
{
    memcpy(b, &v, sizeof(v));
}

void wbytes32(uint8_t *b, uint32_t v)
{
    b[0] = (uint8_t)v;
    b[1] = (uint8_t)(v >> 8);
    b[2] = (uint8_t)(v >> 16);
    b[3] = (uint8_t)(v >> 24);
}

void wcopy32(uint8_t *b, uint32_t v)
{
    memcpy(b, &v, sizeof(v));
}
*/
#ifdef TUKLIB_FAST_UNALIGNED_ACCESS

static inline uint16_t
read16ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint16_t *)buf;
#else
	uint16_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}

static inline uint32_t
read32ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint32_t *)buf;
#else
	uint32_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}

static inline uint64_t
read64ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint64_t *)buf;
#else
	uint64_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}

static inline void
write16ne(uint8_t *buf, uint16_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint16_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}

static inline void
write32ne(uint8_t *buf, uint32_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint32_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}

static inline void
write64ne(uint8_t *buf, uint64_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint64_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}
static inline uint16_t
read16be(const uint8_t *buf)
{
	uint16_t num = read16ne(buf);
	return conv16be(num);
}

static inline uint16_t
read16le(const uint8_t *buf)
{
	uint16_t num = read16ne(buf);
	return conv16le(num);
}

static inline uint32_t
read32be(const uint8_t *buf)
{
	uint32_t num = read32ne(buf);
	return conv32be(num);
}

static inline uint32_t
read32le(const uint8_t *buf)
{
	uint32_t num = read32ne(buf);
	return conv32le(num);
}

static inline uint64_t
read64be(const uint8_t *buf)
{
	uint64_t num = read64ne(buf);
	return conv64be(num);
}

static inline uint64_t
read64le(const uint8_t *buf)
{
	uint64_t num = read64ne(buf);
	return conv64le(num);
}
// NOTE: Possible byte swapping must be done in a macro to allow the compiler
// to optimize byte swapping of constants when using glibc's or *BSD's
// byte swapping macros. The actual write is done in an inline function
// to make type checking of the buf pointer possible.
#define write16be(buf, num) write16ne(buf, conv16be(num))
#define write32be(buf, num) write32ne(buf, conv32be(num))
#define write64be(buf, num) write64ne(buf, conv64be(num))
#define write16le(buf, num) write16ne(buf, conv16le(num))
#define write32le(buf, num) write32ne(buf, conv32le(num))
#define write64le(buf, num) write64ne(buf, conv64le(num))
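// Illustrative example (hypothetical caller, not part of this header):
// because conv32be() is applied inside the macro, a call like
//
//     write32be(buf, 0x000000FD);
//
// lets the compiler byteswap the constant at compile time and store the
// already-swapped value instead of swapping at run time.
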
#else

#ifdef WORDS_BIGENDIAN
#	define read16ne read16be
#	define read32ne read32be
#	define read64ne read64be
#	define write16ne write16be
#	define write32ne write32be
#	define write64ne write64be
#else
#	define read16ne read16le
#	define read32ne read32le
#	define read64ne read64le
#	define write16ne write16le
#	define write32ne write32le
#	define write64ne write64le
#endif
static inline uint16_t
read16be(const uint8_t *buf)
{
	uint16_t num = ((uint16_t)buf[0] << 8) | (uint16_t)buf[1];
	return num;
}

static inline uint16_t
read16le(const uint8_t *buf)
{
	uint16_t num = ((uint16_t)buf[0]) | ((uint16_t)buf[1] << 8);
	return num;
}

static inline uint32_t
read32be(const uint8_t *buf)
{
	uint32_t num = (uint32_t)buf[0] << 24;
	num |= (uint32_t)buf[1] << 16;
	num |= (uint32_t)buf[2] << 8;
	num |= (uint32_t)buf[3];
	return num;
}

static inline uint32_t
read32le(const uint8_t *buf)
{
	uint32_t num = (uint32_t)buf[0];
	num |= (uint32_t)buf[1] << 8;
	num |= (uint32_t)buf[2] << 16;
	num |= (uint32_t)buf[3] << 24;
	return num;
}

static inline uint64_t
read64be(const uint8_t *buf)
{
	uint64_t num = (uint64_t)buf[0] << 56;
	num |= (uint64_t)buf[1] << 48;
	num |= (uint64_t)buf[2] << 40;
	num |= (uint64_t)buf[3] << 32;
	num |= (uint64_t)buf[4] << 24;
	num |= (uint64_t)buf[5] << 16;
	num |= (uint64_t)buf[6] << 8;
	num |= (uint64_t)buf[7];
	return num;
}

static inline uint64_t
read64le(const uint8_t *buf)
{
	uint64_t num = (uint64_t)buf[0];
	num |= (uint64_t)buf[1] << 8;
	num |= (uint64_t)buf[2] << 16;
	num |= (uint64_t)buf[3] << 24;
	num |= (uint64_t)buf[4] << 32;
	num |= (uint64_t)buf[5] << 40;
	num |= (uint64_t)buf[6] << 48;
	num |= (uint64_t)buf[7] << 56;
	return num;
}
static inline void
write16be(uint8_t *buf, uint16_t num)
{
	buf[0] = (uint8_t)(num >> 8);
	buf[1] = (uint8_t)num;
	return;
}

static inline void
write16le(uint8_t *buf, uint16_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	return;
}

static inline void
write32be(uint8_t *buf, uint32_t num)
{
	buf[0] = (uint8_t)(num >> 24);
	buf[1] = (uint8_t)(num >> 16);
	buf[2] = (uint8_t)(num >> 8);
	buf[3] = (uint8_t)num;
	return;
}

static inline void
write32le(uint8_t *buf, uint32_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	buf[2] = (uint8_t)(num >> 16);
	buf[3] = (uint8_t)(num >> 24);
	return;
}

static inline void
write64be(uint8_t *buf, uint64_t num)
{
	buf[0] = (uint8_t)(num >> 56);
	buf[1] = (uint8_t)(num >> 48);
	buf[2] = (uint8_t)(num >> 40);
	buf[3] = (uint8_t)(num >> 32);
	buf[4] = (uint8_t)(num >> 24);
	buf[5] = (uint8_t)(num >> 16);
	buf[6] = (uint8_t)(num >> 8);
	buf[7] = (uint8_t)num;
	return;
}

static inline void
write64le(uint8_t *buf, uint64_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	buf[2] = (uint8_t)(num >> 16);
	buf[3] = (uint8_t)(num >> 24);
	buf[4] = (uint8_t)(num >> 32);
	buf[5] = (uint8_t)(num >> 40);
	buf[6] = (uint8_t)(num >> 48);
	buf[7] = (uint8_t)(num >> 56);
	return;
}

#endif
//////////////////////////////
// Aligned reads and writes //
//////////////////////////////

// Separate functions for aligned reads and writes are provided since on
// strict-align archs aligned access is much faster than unaligned access.
//
// Just like in the unaligned case, memcpy() is needed to avoid
// strict aliasing violations. However, on archs that don't support
// unaligned access the compiler cannot know that the pointers given
// to memcpy() are aligned, which results in slow code. As of C11 there is
// no standard way to tell the compiler that we know that the address is
// aligned but some compilers have language extensions to do that. With
// such language extensions the memcpy() method gives excellent results.
//
// What to do on a strict-align system when no known language extensions
// are available? Falling back to byte-by-byte access would be safe but ruin
// optimizations that have been made specifically with aligned access in mind.
// As a compromise, aligned reads will fall back to non-compliant type punning
// but aligned writes will be byte-by-byte, that is, fast reads are preferred
// over fast writes. This obviously isn't great but hopefully it's a working
// compromise for now.
//
// __builtin_assume_aligned is supported by GCC >= 4.7 and Clang >= 3.6.
#ifdef HAVE___BUILTIN_ASSUME_ALIGNED
#	define tuklib_memcpy_aligned(dest, src, size) \
		memcpy(dest, __builtin_assume_aligned(src, size), size)
#else
#	define tuklib_memcpy_aligned(dest, src, size) \
		memcpy(dest, src, size)
#	ifndef TUKLIB_FAST_UNALIGNED_ACCESS
#		define TUKLIB_USE_UNSAFE_ALIGNED_READS 1
#	endif
#endif
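// Illustrative sketch (hypothetical caller, not part of this header): the
// aligned variants below are meant for buffers whose alignment is already
// known, e.g. an array of 32-bit fields accessed through a uint8_t pointer.
// The function name and its alignment assumption are made up for the example.
/*
static uint32_t
example_sum32le(const uint8_t *buf, size_t count)
{
	// buf is assumed to be correctly aligned for uint32_t and to hold
	// count little-endian 32-bit fields.
	uint32_t sum = 0;
	for (size_t i = 0; i < count; ++i)
		sum += aligned_read32le(buf + i * 4);

	return sum;
}
*/
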
static inline uint16_t
aligned_read16ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint16_t *)buf;
#else
	uint16_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}

static inline uint32_t
aligned_read32ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint32_t *)buf;
#else
	uint32_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}

static inline uint64_t
aligned_read64ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint64_t *)buf;
#else
	uint64_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}

static inline void
aligned_write16ne(uint8_t *buf, uint16_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint16_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}

static inline void
aligned_write32ne(uint8_t *buf, uint32_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint32_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}

static inline void
aligned_write64ne(uint8_t *buf, uint64_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint64_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}
static inline uint16_t
aligned_read16be(const uint8_t *buf)
{
	uint16_t num = aligned_read16ne(buf);
	return conv16be(num);
}

static inline uint16_t
aligned_read16le(const uint8_t *buf)
{
	uint16_t num = aligned_read16ne(buf);
	return conv16le(num);
}

static inline uint32_t
aligned_read32be(const uint8_t *buf)
{
	uint32_t num = aligned_read32ne(buf);
	return conv32be(num);
}

static inline uint32_t
aligned_read32le(const uint8_t *buf)
{
	uint32_t num = aligned_read32ne(buf);
	return conv32le(num);
}

static inline uint64_t
aligned_read64be(const uint8_t *buf)
{
	uint64_t num = aligned_read64ne(buf);
	return conv64be(num);
}

static inline uint64_t
aligned_read64le(const uint8_t *buf)
{
	uint64_t num = aligned_read64ne(buf);
	return conv64le(num);
}
// These need to be macros like in the unaligned case.
#define aligned_write16be(buf, num) aligned_write16ne((buf), conv16be(num))
#define aligned_write16le(buf, num) aligned_write16ne((buf), conv16le(num))
#define aligned_write32be(buf, num) aligned_write32ne((buf), conv32be(num))
#define aligned_write32le(buf, num) aligned_write32ne((buf), conv32le(num))
#define aligned_write64be(buf, num) aligned_write64ne((buf), conv64be(num))
#define aligned_write64le(buf, num) aligned_write64ne((buf), conv64le(num))
static inline uint32_t
bsr32(uint32_t n)
{
	// Check for ICC first, since it tends to define __GNUC__ too.
#if defined(__INTEL_COMPILER)
	return _bit_scan_reverse(n);

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX
	// GCC >= 3.4 has __builtin_clz(), which gives good results on
	// multiple architectures. On x86, __builtin_clz() ^ 31U becomes
	// either plain BSR (so the XOR gets optimized away) or LZCNT and
	// XOR (if -march indicates that SSE4a instructions are supported).
	return (uint32_t)__builtin_clz(n) ^ 31U;

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsrl %1, %0" : "=r" (i) : "rm" (n));
	return i;

#elif defined(_MSC_VER)
	unsigned long i;
	_BitScanReverse(&i, n);
	return i;

#else
	uint32_t i = 31;

	if ((n & 0xFFFF0000) == 0) {
		n <<= 16;
		i = 15;
	}

	if ((n & 0xFF000000) == 0) {
		n <<= 8;
		i -= 8;
	}

	if ((n & 0xF0000000) == 0) {
		n <<= 4;
		i -= 4;
	}

	if ((n & 0xC0000000) == 0) {
		n <<= 2;
		i -= 2;
	}

	if ((n & 0x80000000) == 0)
		--i;

	return i;
#endif
}
static inline uint32_t
clz32(uint32_t n)
{
#if defined(__INTEL_COMPILER)
	return _bit_scan_reverse(n) ^ 31U;

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX
	return (uint32_t)__builtin_clz(n);

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsrl %1, %0\n\t"
		"xorl $31, %0"
		: "=r" (i) : "rm" (n));
	return i;

#elif defined(_MSC_VER)
	unsigned long i;
	_BitScanReverse(&i, n);
	return i ^ 31U;

#else
	uint32_t i = 0;

	if ((n & 0xFFFF0000) == 0) {
		n <<= 16;
		i = 16;
	}

	if ((n & 0xFF000000) == 0) {
		n <<= 8;
		i += 8;
	}

	if ((n & 0xF0000000) == 0) {
		n <<= 4;
		i += 4;
	}

	if ((n & 0xC0000000) == 0) {
		n <<= 2;
		i += 2;
	}

	if ((n & 0x80000000) == 0)
		++i;

	return i;
#endif
}
static inline uint32_t
ctz32(uint32_t n)
{
#if defined(__INTEL_COMPILER)
	return _bit_scan_forward(n);

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX >= UINT32_MAX
	return (uint32_t)__builtin_ctz(n);

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsfl %1, %0" : "=r" (i) : "rm" (n));
	return i;

#elif defined(_MSC_VER)
	unsigned long i;
	_BitScanForward(&i, n);
	return i;

#else
	uint32_t i = 0;

	if ((n & 0x0000FFFF) == 0) {
		n >>= 16;
		i = 16;
	}

	if ((n & 0x000000FF) == 0) {
		n >>= 8;
		i += 8;
	}

	if ((n & 0x0000000F) == 0) {
		n >>= 4;
		i += 4;
	}

	if ((n & 0x00000003) == 0) {
		n >>= 2;
		i += 2;
	}

	if ((n & 0x00000001) == 0)