// SPDX-License-Identifier: 0BSD

///////////////////////////////////////////////////////////////////////////////
//
/// \file       tuklib_integer.h
/// \brief      Various integer and bit operations
///
/// This file provides macros or functions to do some basic integer and bit
/// operations.
///
/// Native endian inline functions (XX = 16, 32, or 64):
///   - Unaligned native endian reads: readXXne(ptr)
///   - Unaligned native endian writes: writeXXne(ptr, num)
///   - Aligned native endian reads: aligned_readXXne(ptr)
///   - Aligned native endian writes: aligned_writeXXne(ptr, num)
///
/// Endianness-converting integer operations (these can be macros!)
/// (XX = 16, 32, or 64; Y = b or l):
///   - Byte swapping: byteswapXX(num)
///   - Byte order conversions to/from native (byteswaps if Y isn't
///     the native endianness): convXXYe(num)
///   - Unaligned reads: readXXYe(ptr)
///   - Unaligned writes: writeXXYe(ptr, num)
///   - Aligned reads: aligned_readXXYe(ptr)
///   - Aligned writes: aligned_writeXXYe(ptr, num)
///
/// Since the above can be macros, the arguments should have no side effects
/// because they may be evaluated more than once.
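///
/// For example, with the generic byteswap32() fallback defined below, a
/// hypothetical call like conv32be(next_word()) could evaluate next_word()
/// several times; pass a side-effect-free expression instead.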
///
/// Bit scan operations for non-zero 32-bit integers (inline functions):
///   - Bit scan reverse (find highest non-zero bit): bsr32(num)
///   - Count leading zeros: clz32(num)
///   - Count trailing zeros: ctz32(num)
///   - Bit scan forward (simply an alias for ctz32()): bsf32(num)
///
/// The above bit scan operations return 0-31. If num is zero,
/// the result is undefined.
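///
/// A small usage sketch (values are illustrative, not from the original):
///
///     uint8_t buf[4] = { 0x12, 0x34, 0x56, 0x78 };
///     uint32_t be = read32be(buf);  // 0x12345678
///     uint32_t le = read32le(buf);  // 0x78563412
///     uint32_t hi = bsr32(be);      // 28, index of the highest set bit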
//
//  Authors:    Lasse Collin
//              Joachim Henke
//
///////////////////////////////////////////////////////////////////////////////

#ifndef TUKLIB_INTEGER_H
#define TUKLIB_INTEGER_H

#include "tuklib_common.h"
#include <string.h>

// Newer Intel C compilers require immintrin.h for _bit_scan_reverse()
// and such functions.
#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1500)
#	include <immintrin.h>
// Only include <intrin.h> when it is needed. GCC and Clang can both
// use __builtin's, so we only need Windows intrinsics when using MSVC.
// GCC and Clang can set _MSC_VER on Windows, so we need to exclude these
// cases explicitly.
#elif defined(_MSC_VER) && !TUKLIB_GNUC_REQ(3, 4) && !defined(__clang__)
#	include <intrin.h>
#endif


///////////////////
// Byte swapping //
///////////////////

#if defined(HAVE___BUILTIN_BSWAPXX)
	// GCC >= 4.8 and Clang
#	define byteswap16(num) __builtin_bswap16(num)
#	define byteswap32(num) __builtin_bswap32(num)
#	define byteswap64(num) __builtin_bswap64(num)

#elif defined(HAVE_BYTESWAP_H)
	// glibc, uClibc, dietlibc
#	include <byteswap.h>
#	ifdef HAVE_BSWAP_16
#		define byteswap16(num) bswap_16(num)
#	endif
#	ifdef HAVE_BSWAP_32
#		define byteswap32(num) bswap_32(num)
#	endif
#	ifdef HAVE_BSWAP_64
#		define byteswap64(num) bswap_64(num)
#	endif
#elif defined(HAVE_SYS_ENDIAN_H)
	// *BSDs and Darwin
#	include <sys/endian.h>
#	ifdef __OpenBSD__
#		define byteswap16(num) swap16(num)
#		define byteswap32(num) swap32(num)
#		define byteswap64(num) swap64(num)
#	else
#		define byteswap16(num) bswap16(num)
#		define byteswap32(num) bswap32(num)
#		define byteswap64(num) bswap64(num)
#	endif
#elif defined(HAVE_SYS_BYTEORDER_H)
	// Solaris
#	include <sys/byteorder.h>
#	ifdef BSWAP_16
#		define byteswap16(num) BSWAP_16(num)
#	endif
#	ifdef BSWAP_32
#		define byteswap32(num) BSWAP_32(num)
#	endif
#	ifdef BSWAP_64
#		define byteswap64(num) BSWAP_64(num)
#	endif
#	ifdef BE_16
#		define conv16be(num) BE_16(num)
#	endif
#	ifdef BE_32
#		define conv32be(num) BE_32(num)
#	endif
#	ifdef BE_64
#		define conv64be(num) BE_64(num)
#	endif
#	ifdef LE_16
#		define conv16le(num) LE_16(num)
#	endif
#	ifdef LE_32
#		define conv32le(num) LE_32(num)
#	endif
#	ifdef LE_64
#		define conv64le(num) LE_64(num)
#	endif
#endif
#ifndef byteswap16
#	define byteswap16(n) (uint16_t)( \
		  (((n) & 0x00FFU) << 8) \
		| (((n) & 0xFF00U) >> 8) \
	)
#endif

#ifndef byteswap32
#	define byteswap32(n) (uint32_t)( \
		  (((n) & UINT32_C(0x000000FF)) << 24) \
		| (((n) & UINT32_C(0x0000FF00)) << 8) \
		| (((n) & UINT32_C(0x00FF0000)) >> 8) \
		| (((n) & UINT32_C(0xFF000000)) >> 24) \
	)
#endif

#ifndef byteswap64
#	define byteswap64(n) (uint64_t)( \
		  (((n) & UINT64_C(0x00000000000000FF)) << 56) \
		| (((n) & UINT64_C(0x000000000000FF00)) << 40) \
		| (((n) & UINT64_C(0x0000000000FF0000)) << 24) \
		| (((n) & UINT64_C(0x00000000FF000000)) << 8) \
		| (((n) & UINT64_C(0x000000FF00000000)) >> 8) \
		| (((n) & UINT64_C(0x0000FF0000000000)) >> 24) \
		| (((n) & UINT64_C(0x00FF000000000000)) >> 40) \
		| (((n) & UINT64_C(0xFF00000000000000)) >> 56) \
	)
#endif
// Define conversion macros using the basic byte swapping macros.
#ifdef WORDS_BIGENDIAN
#	ifndef conv16be
#		define conv16be(num) ((uint16_t)(num))
#	endif
#	ifndef conv32be
#		define conv32be(num) ((uint32_t)(num))
#	endif
#	ifndef conv64be
#		define conv64be(num) ((uint64_t)(num))
#	endif
#	ifndef conv16le
#		define conv16le(num) byteswap16(num)
#	endif
#	ifndef conv32le
#		define conv32le(num) byteswap32(num)
#	endif
#	ifndef conv64le
#		define conv64le(num) byteswap64(num)
#	endif
#else
#	ifndef conv16be
#		define conv16be(num) byteswap16(num)
#	endif
#	ifndef conv32be
#		define conv32be(num) byteswap32(num)
#	endif
#	ifndef conv64be
#		define conv64be(num) byteswap64(num)
#	endif
#	ifndef conv16le
#		define conv16le(num) ((uint16_t)(num))
#	endif
#	ifndef conv32le
#		define conv32le(num) ((uint32_t)(num))
#	endif
#	ifndef conv64le
#		define conv64le(num) ((uint64_t)(num))
#	endif
#endif
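
// For example (illustrative, not from the original file): on a little endian
// target, conv32be(0x11223344) expands to byteswap32(0x11223344) and
// evaluates to 0x44332211, while conv32le() is just a cast.
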
////////////////////////////////
// Unaligned reads and writes //
////////////////////////////////

// No-strict-align archs like x86-64
// ---------------------------------
//
// The traditional way of casting e.g. *(const uint16_t *)uint8_pointer
// is bad even if the uint8_pointer is properly aligned because this kind
// of cast breaks strict aliasing rules and results in undefined behavior.
// With unaligned pointers it's even worse: compilers may emit vector
// instructions that require aligned pointers even if non-vector
// instructions work with unaligned pointers.
//
// Using memcpy() is the standard-compliant way to do unaligned access.
// Many modern compilers inline it so there is no function call overhead.
// For those compilers that don't handle the memcpy() method well, the
// old casting method (that violates strict aliasing) can be requested at
// build time. A third method, casting to a packed struct, would also be
// an option but isn't provided to keep things simpler (it's already a mess).
// Hopefully this is flexible enough in practice.
//
// Some compilers on x86-64 like Clang >= 10 and GCC >= 5.1 detect that
//
//     buf[0] | (buf[1] << 8)
//
// reads a 16-bit value and can emit a single 16-bit load and produce
// code identical to the memcpy() method. In other cases Clang and GCC
// produce either the same or better code with memcpy(). For example, Clang 9
// on x86-64 can detect a 32-bit load but not a 16-bit load.
//
// MSVC uses unaligned access with the memcpy() method but emits byte-by-byte
// code for "buf[0] | (buf[1] << 8)".
//
// Conclusion: The memcpy() method is the best choice when unaligned access
// is supported.
// Strict-align archs like SPARC
// -----------------------------
//
// GCC versions from around 4.x to at least 13.2.0 produce worse code
// from the memcpy() method than from simple byte-by-byte shift-or code
// when reading a 32-bit integer:
//
//     (1) It may be constructed on stack using four 8-bit loads,
//         four 8-bit stores to stack, and finally one 32-bit load from stack.
//
//     (2) Especially with -Os, an actual memcpy() call may be emitted.
//
// This is true at least on ARM, ARM64, SPARC, SPARC64, MIPS64EL, and
// RISC-V. Of these, ARM, ARM64, and RISC-V support unaligned access in
// some processors but not all, so this is relevant only in the case when
// GCC assumes that unaligned access is not supported or -mstrict-align or
// -mno-unaligned-access is used.
//
// For Clang it makes little difference. ARM64 with -O2 -mstrict-align
// was one of the very few with a minor difference: the memcpy() version
// was one instruction longer.
//
// Conclusion: At least in the case of GCC and Clang, byte-by-byte code is
// the best choice for strict-align archs to do unaligned access.
//
// See also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111502
//
// Thanks to <https://godbolt.org/> it was easy to test different compilers.
// The following is for little endian targets:
/*
#include <stdint.h>
#include <string.h>

uint32_t bytes16(const uint8_t *b)
{
	return (uint32_t)b[0]
		| ((uint32_t)b[1] << 8);
}

uint32_t copy16(const uint8_t *b)
{
	uint16_t v;
	memcpy(&v, b, sizeof(v));
	return v;
}

uint32_t bytes32(const uint8_t *b)
{
	return (uint32_t)b[0]
		| ((uint32_t)b[1] << 8)
		| ((uint32_t)b[2] << 16)
		| ((uint32_t)b[3] << 24);
}

uint32_t copy32(const uint8_t *b)
{
	uint32_t v;
	memcpy(&v, b, sizeof(v));
	return v;
}

void wbytes16(uint8_t *b, uint16_t v)
{
	b[0] = (uint8_t)v;
	b[1] = (uint8_t)(v >> 8);
}

void wcopy16(uint8_t *b, uint16_t v)
{
	memcpy(b, &v, sizeof(v));
}

void wbytes32(uint8_t *b, uint32_t v)
{
	b[0] = (uint8_t)v;
	b[1] = (uint8_t)(v >> 8);
	b[2] = (uint8_t)(v >> 16);
	b[3] = (uint8_t)(v >> 24);
}

void wcopy32(uint8_t *b, uint32_t v)
{
	memcpy(b, &v, sizeof(v));
}
*/
#ifdef TUKLIB_FAST_UNALIGNED_ACCESS

static inline uint16_t
read16ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint16_t *)buf;
#else
	uint16_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint32_t
read32ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint32_t *)buf;
#else
	uint32_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint64_t
read64ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint64_t *)buf;
#else
	uint64_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}


static inline void
write16ne(uint8_t *buf, uint16_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint16_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
write32ne(uint8_t *buf, uint32_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint32_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
write64ne(uint8_t *buf, uint64_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint64_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}

static inline uint16_t
read16be(const uint8_t *buf)
{
	uint16_t num = read16ne(buf);
	return conv16be(num);
}


static inline uint16_t
read16le(const uint8_t *buf)
{
	uint16_t num = read16ne(buf);
	return conv16le(num);
}


static inline uint32_t
read32be(const uint8_t *buf)
{
	uint32_t num = read32ne(buf);
	return conv32be(num);
}


static inline uint32_t
read32le(const uint8_t *buf)
{
	uint32_t num = read32ne(buf);
	return conv32le(num);
}


static inline uint64_t
read64be(const uint8_t *buf)
{
	uint64_t num = read64ne(buf);
	return conv64be(num);
}


static inline uint64_t
read64le(const uint8_t *buf)
{
	uint64_t num = read64ne(buf);
	return conv64le(num);
}

// NOTE: Possible byte swapping must be done in a macro to allow the compiler
// to optimize byte swapping of constants when using glibc's or *BSD's
// byte swapping macros. The actual write is done in an inline function
// to make type checking of the buf pointer possible.
#define write16be(buf, num) write16ne(buf, conv16be(num))
#define write32be(buf, num) write32ne(buf, conv32be(num))
#define write64be(buf, num) write64ne(buf, conv64be(num))
#define write16le(buf, num) write16ne(buf, conv16le(num))
#define write32le(buf, num) write32ne(buf, conv32le(num))
#define write64le(buf, num) write64ne(buf, conv64le(num))
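
// For example (illustrative, not from the original file): because the
// conversion happens in the macro, a constant argument can be byteswapped
// at compile time, so
//
//     write32be(buf, UINT32_C(0x12345678));
//
// stores the bytes 0x12, 0x34, 0x56, 0x78 in that order with no runtime swap.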

#else

#ifdef WORDS_BIGENDIAN
#	define read16ne read16be
#	define read32ne read32be
#	define read64ne read64be
#	define write16ne write16be
#	define write32ne write32be
#	define write64ne write64be
#else
#	define read16ne read16le
#	define read32ne read32le
#	define read64ne read64le
#	define write16ne write16le
#	define write32ne write32le
#	define write64ne write64le
#endif

static inline uint16_t
read16be(const uint8_t *buf)
{
	uint16_t num = ((uint16_t)buf[0] << 8) | (uint16_t)buf[1];
	return num;
}


static inline uint16_t
read16le(const uint8_t *buf)
{
	uint16_t num = ((uint16_t)buf[0]) | ((uint16_t)buf[1] << 8);
	return num;
}


static inline uint32_t
read32be(const uint8_t *buf)
{
	uint32_t num = (uint32_t)buf[0] << 24;
	num |= (uint32_t)buf[1] << 16;
	num |= (uint32_t)buf[2] << 8;
	num |= (uint32_t)buf[3];
	return num;
}


static inline uint32_t
read32le(const uint8_t *buf)
{
	uint32_t num = (uint32_t)buf[0];
	num |= (uint32_t)buf[1] << 8;
	num |= (uint32_t)buf[2] << 16;
	num |= (uint32_t)buf[3] << 24;
	return num;
}


static inline uint64_t
read64be(const uint8_t *buf)
{
	uint64_t num = (uint64_t)buf[0] << 56;
	num |= (uint64_t)buf[1] << 48;
	num |= (uint64_t)buf[2] << 40;
	num |= (uint64_t)buf[3] << 32;
	num |= (uint64_t)buf[4] << 24;
	num |= (uint64_t)buf[5] << 16;
	num |= (uint64_t)buf[6] << 8;
	num |= (uint64_t)buf[7];
	return num;
}


static inline uint64_t
read64le(const uint8_t *buf)
{
	uint64_t num = (uint64_t)buf[0];
	num |= (uint64_t)buf[1] << 8;
	num |= (uint64_t)buf[2] << 16;
	num |= (uint64_t)buf[3] << 24;
	num |= (uint64_t)buf[4] << 32;
	num |= (uint64_t)buf[5] << 40;
	num |= (uint64_t)buf[6] << 48;
	num |= (uint64_t)buf[7] << 56;
	return num;
}

static inline void
write16be(uint8_t *buf, uint16_t num)
{
	buf[0] = (uint8_t)(num >> 8);
	buf[1] = (uint8_t)num;
	return;
}


static inline void
write16le(uint8_t *buf, uint16_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	return;
}


static inline void
write32be(uint8_t *buf, uint32_t num)
{
	buf[0] = (uint8_t)(num >> 24);
	buf[1] = (uint8_t)(num >> 16);
	buf[2] = (uint8_t)(num >> 8);
	buf[3] = (uint8_t)num;
	return;
}


static inline void
write32le(uint8_t *buf, uint32_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	buf[2] = (uint8_t)(num >> 16);
	buf[3] = (uint8_t)(num >> 24);
	return;
}


static inline void
write64be(uint8_t *buf, uint64_t num)
{
	buf[0] = (uint8_t)(num >> 56);
	buf[1] = (uint8_t)(num >> 48);
	buf[2] = (uint8_t)(num >> 40);
	buf[3] = (uint8_t)(num >> 32);
	buf[4] = (uint8_t)(num >> 24);
	buf[5] = (uint8_t)(num >> 16);
	buf[6] = (uint8_t)(num >> 8);
	buf[7] = (uint8_t)num;
	return;
}


static inline void
write64le(uint8_t *buf, uint64_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	buf[2] = (uint8_t)(num >> 16);
	buf[3] = (uint8_t)(num >> 24);
	buf[4] = (uint8_t)(num >> 32);
	buf[5] = (uint8_t)(num >> 40);
	buf[6] = (uint8_t)(num >> 48);
	buf[7] = (uint8_t)(num >> 56);
	return;
}

#endif

//////////////////////////////
// Aligned reads and writes //
//////////////////////////////

// Separate functions for aligned reads and writes are provided since on
// strict-align archs aligned access is much faster than unaligned access.
//
// Just like in the unaligned case, memcpy() is needed to avoid
// strict aliasing violations. However, on archs that don't support
// unaligned access the compiler cannot know that the pointers given
// to memcpy() are aligned which results in slow code. As of C11 there is
// no standard way to tell the compiler that we know that the address is
// aligned but some compilers have language extensions to do that. With
// such language extensions the memcpy() method gives excellent results.
//
// What to do on a strict-align system when no known language extensions
// are available? Falling back to byte-by-byte access would be safe but ruin
// optimizations that have been made specifically with aligned access in mind.
// As a compromise, aligned reads will fall back to non-compliant type punning
// but aligned writes will be byte-by-byte, that is, fast reads are preferred
// over fast writes. This obviously isn't great but hopefully it's a working
// compromise for now.
//
// __builtin_assume_aligned is supported by GCC >= 4.7 and Clang >= 3.6.
#ifdef HAVE___BUILTIN_ASSUME_ALIGNED
#	define tuklib_memcpy_aligned(dest, src, size) \
		memcpy(dest, __builtin_assume_aligned(src, size), size)
#else
#	define tuklib_memcpy_aligned(dest, src, size) \
		memcpy(dest, src, size)
#	ifndef TUKLIB_FAST_UNALIGNED_ACCESS
#		define TUKLIB_USE_UNSAFE_ALIGNED_READS 1
#	endif
#endif

static inline uint16_t
aligned_read16ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint16_t *)buf;
#else
	uint16_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint32_t
aligned_read32ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint32_t *)buf;
#else
	uint32_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint64_t
aligned_read64ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint64_t *)buf;
#else
	uint64_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


static inline void
aligned_write16ne(uint8_t *buf, uint16_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint16_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
aligned_write32ne(uint8_t *buf, uint32_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint32_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
aligned_write64ne(uint8_t *buf, uint64_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint64_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}

static inline uint16_t
aligned_read16be(const uint8_t *buf)
{
	uint16_t num = aligned_read16ne(buf);
	return conv16be(num);
}


static inline uint16_t
aligned_read16le(const uint8_t *buf)
{
	uint16_t num = aligned_read16ne(buf);
	return conv16le(num);
}


static inline uint32_t
aligned_read32be(const uint8_t *buf)
{
	uint32_t num = aligned_read32ne(buf);
	return conv32be(num);
}


static inline uint32_t
aligned_read32le(const uint8_t *buf)
{
	uint32_t num = aligned_read32ne(buf);
	return conv32le(num);
}


static inline uint64_t
aligned_read64be(const uint8_t *buf)
{
	uint64_t num = aligned_read64ne(buf);
	return conv64be(num);
}


static inline uint64_t
aligned_read64le(const uint8_t *buf)
{
	uint64_t num = aligned_read64ne(buf);
	return conv64le(num);
}

// These need to be macros like in the unaligned case.
#define aligned_write16be(buf, num) aligned_write16ne((buf), conv16be(num))
#define aligned_write16le(buf, num) aligned_write16ne((buf), conv16le(num))
#define aligned_write32be(buf, num) aligned_write32ne((buf), conv32be(num))
#define aligned_write32le(buf, num) aligned_write32ne((buf), conv32le(num))
#define aligned_write64be(buf, num) aligned_write64ne((buf), conv64be(num))
#define aligned_write64le(buf, num) aligned_write64ne((buf), conv64le(num))
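
// A hypothetical usage sketch (not part of the original header): summing
// 32-bit little endian integers from a buffer whose start is known to be
// suitably aligned, e.g. memory returned by malloc():
/*
static uint32_t
sum32le_aligned(const uint8_t *buf, size_t count)
{
	uint32_t sum = 0;
	for (size_t i = 0; i < count; ++i)
		sum += aligned_read32le(buf + 4 * i);
	return sum;
}
*/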

////////////////////
// Bit operations //
////////////////////


static inline uint32_t
bsr32(uint32_t n)
{
	// Check for ICC first, since it tends to define __GNUC__ too.
#if defined(__INTEL_COMPILER)
	return _bit_scan_reverse(n);

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX
	// GCC >= 3.4 has __builtin_clz(), which gives good results on
	// multiple architectures. On x86, __builtin_clz() ^ 31U becomes
	// either plain BSR (so the XOR gets optimized away) or LZCNT and
	// XOR (if -march indicates that SSE4a instructions are supported).
	return (uint32_t)__builtin_clz(n) ^ 31U;

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsrl %1, %0" : "=r" (i) : "rm" (n));
	return i;

#elif defined(_MSC_VER)
	unsigned long i;
	_BitScanReverse(&i, n);
	return i;

#else
	uint32_t i = 31;

	if ((n & 0xFFFF0000) == 0) {
		n <<= 16;
		i = 15;
	}

	if ((n & 0xFF000000) == 0) {
		n <<= 8;
		i -= 8;
	}

	if ((n & 0xF0000000) == 0) {
		n <<= 4;
		i -= 4;
	}

	if ((n & 0xC0000000) == 0) {
		n <<= 2;
		i -= 2;
	}

	if ((n & 0x80000000) == 0)
		--i;

	return i;
#endif
}


static inline uint32_t
clz32(uint32_t n)
{
#if defined(__INTEL_COMPILER)
	return _bit_scan_reverse(n) ^ 31U;

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX
	return (uint32_t)__builtin_clz(n);

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsrl %1, %0\n\t"
		"xorl $31, %0"
		: "=r" (i) : "rm" (n));
	return i;

#elif defined(_MSC_VER)
	unsigned long i;
	_BitScanReverse(&i, n);
	return i ^ 31U;

#else
	uint32_t i = 0;

	if ((n & 0xFFFF0000) == 0) {
		n <<= 16;
		i = 16;
	}

	if ((n & 0xFF000000) == 0) {
		n <<= 8;
		i += 8;
	}

	if ((n & 0xF0000000) == 0) {
		n <<= 4;
		i += 4;
	}

	if ((n & 0xC0000000) == 0) {
		n <<= 2;
		i += 2;
	}

	if ((n & 0x80000000) == 0)
		++i;

	return i;
#endif
}


static inline uint32_t
ctz32(uint32_t n)
{
#if defined(__INTEL_COMPILER)
	return _bit_scan_forward(n);

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX >= UINT32_MAX
	return (uint32_t)__builtin_ctz(n);

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsfl %1, %0" : "=r" (i) : "rm" (n));
	return i;

#elif defined(_MSC_VER)
	unsigned long i;
	_BitScanForward(&i, n);
	return i;

#else
	uint32_t i = 0;

	if ((n & 0x0000FFFF) == 0) {
		n >>= 16;
		i = 16;
	}

	if ((n & 0x000000FF) == 0) {
		n >>= 8;
		i += 8;
	}

	if ((n & 0x0000000F) == 0) {
		n >>= 4;
		i += 4;
	}

	if ((n & 0x00000003) == 0) {
		n >>= 2;
		i += 2;
	}

	if ((n & 0x00000001) == 0)
		++i;

	return i;
#endif
}
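
// Sanity examples for the bit scan operations (illustrative, not from the
// original file):
//
//     bsr32(1) == 0      bsr32(UINT32_C(0x80000000)) == 31
//     clz32(1) == 31     clz32(UINT32_C(0x80000000)) == 0
//     ctz32(8) == 3      ctz32(UINT32_C(0x80000000)) == 31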
& 0x00000001) == 0)