// SPDX-License-Identifier: 0BSD

///////////////////////////////////////////////////////////////////////////////
//
/// \file       tuklib_integer.h
/// \brief      Various integer and bit operations
///
/// This file provides macros or functions to do some basic integer and bit
/// operations.
///
/// Native endian inline functions (XX = 16, 32, or 64):
///   - Unaligned native endian reads: readXXne(ptr)
///   - Unaligned native endian writes: writeXXne(ptr, num)
///   - Aligned native endian reads: aligned_readXXne(ptr)
///   - Aligned native endian writes: aligned_writeXXne(ptr, num)
///
/// Endianness-converting integer operations (these can be macros!)
/// (XX = 16, 32, or 64; Y = b or l):
///   - Byte swapping: byteswapXX(num)
///   - Byte order conversions to/from native (byteswaps if Y isn't
///     the native endianness): convXXYe(num)
///   - Unaligned reads: readXXYe(ptr)
///   - Unaligned writes: writeXXYe(ptr, num)
///   - Aligned reads: aligned_readXXYe(ptr)
///   - Aligned writes: aligned_writeXXYe(ptr, num)
///
/// Since the above can be macros, the arguments should have no side effects
/// because they may be evaluated more than once.
///
/// Bit scan operations for non-zero 32-bit integers (inline functions):
///   - Bit scan reverse (find the highest non-zero bit): bsr32(num)
///   - Count leading zeros: clz32(num)
///   - Count trailing zeros: ctz32(num)
///   - Bit scan forward (simply an alias for ctz32()): bsf32(num)
///
/// The above bit scan operations return 0-31. If num is zero,
/// the result is undefined.
//
//  Authors:    Lasse Collin
//              Joachim Henke
//
///////////////////////////////////////////////////////////////////////////////
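
// A minimal usage sketch of the helpers declared below (illustrative only;
// the buffer contents are hypothetical example values):
/*
	uint8_t buf[4] = { 0x01, 0x02, 0x03, 0x04 };

	uint32_t le = read32le(buf);	// 0x04030201
	uint32_t be = read32be(buf);	// 0x01020304

	// Store a value back as little endian bytes: 0x78 0x56 0x34 0x12.
	write32le(buf, UINT32_C(0x12345678));
*/
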
#ifndef TUKLIB_INTEGER_H
#define TUKLIB_INTEGER_H

#include "tuklib_common.h"
#include <string.h>

// Newer Intel C compilers require immintrin.h for _bit_scan_reverse()
// and such functions.
#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1500)
#	include <immintrin.h>
// Only include <intrin.h> when it is needed. GCC and Clang can both
// use __builtin's, so we only need the Windows intrinsics when using MSVC.
// GCC and Clang can set _MSC_VER on Windows, so we need to exclude these
// cases explicitly.
#elif defined(_MSC_VER) && !TUKLIB_GNUC_REQ(3, 4) && !defined(__clang__)
#	include <intrin.h>
#endif


///////////////////
// Byte swapping //
///////////////////

#if defined(HAVE___BUILTIN_BSWAPXX)
	// GCC >= 4.8 and Clang
#	define byteswap16(num) __builtin_bswap16(num)
#	define byteswap32(num) __builtin_bswap32(num)
#	define byteswap64(num) __builtin_bswap64(num)

#elif defined(HAVE_BYTESWAP_H)
	// glibc, uClibc, dietlibc
#	include <byteswap.h>
#	ifdef HAVE_BSWAP_16
#		define byteswap16(num) bswap_16(num)
#	endif
#	ifdef HAVE_BSWAP_32
#		define byteswap32(num) bswap_32(num)
#	endif
#	ifdef HAVE_BSWAP_64
#		define byteswap64(num) bswap_64(num)
#	endif

#elif defined(HAVE_SYS_ENDIAN_H)
	// *BSDs and Darwin
#	include <sys/endian.h>
#	ifdef __OpenBSD__
#		define byteswap16(num) swap16(num)
#		define byteswap32(num) swap32(num)
#		define byteswap64(num) swap64(num)
#	else
#		define byteswap16(num) bswap16(num)
#		define byteswap32(num) bswap32(num)
#		define byteswap64(num) bswap64(num)
#	endif

#elif defined(HAVE_SYS_BYTEORDER_H)
	// Solaris
#	include <sys/byteorder.h>
#	ifdef BSWAP_16
#		define byteswap16(num) BSWAP_16(num)
#	endif
#	ifdef BSWAP_32
#		define byteswap32(num) BSWAP_32(num)
#	endif
#	ifdef BSWAP_64
#		define byteswap64(num) BSWAP_64(num)
#	endif
#	ifdef BE_16
#		define conv16be(num) BE_16(num)
#	endif
#	ifdef BE_32
#		define conv32be(num) BE_32(num)
#	endif
#	ifdef BE_64
#		define conv64be(num) BE_64(num)
#	endif
#	ifdef LE_16
#		define conv16le(num) LE_16(num)
#	endif
#	ifdef LE_32
#		define conv32le(num) LE_32(num)
#	endif
#	ifdef LE_64
#		define conv64le(num) LE_64(num)
#	endif
#endif
#ifndef byteswap16
#	define byteswap16(n) (uint16_t)( \
		  (((n) & 0x00FFU) << 8) \
		| (((n) & 0xFF00U) >> 8) \
	)
#endif

#ifndef byteswap32
#	define byteswap32(n) (uint32_t)( \
		  (((n) & UINT32_C(0x000000FF)) << 24) \
		| (((n) & UINT32_C(0x0000FF00)) << 8) \
		| (((n) & UINT32_C(0x00FF0000)) >> 8) \
		| (((n) & UINT32_C(0xFF000000)) >> 24) \
	)
#endif

#ifndef byteswap64
#	define byteswap64(n) (uint64_t)( \
		  (((n) & UINT64_C(0x00000000000000FF)) << 56) \
		| (((n) & UINT64_C(0x000000000000FF00)) << 40) \
		| (((n) & UINT64_C(0x0000000000FF0000)) << 24) \
		| (((n) & UINT64_C(0x00000000FF000000)) << 8) \
		| (((n) & UINT64_C(0x000000FF00000000)) >> 8) \
		| (((n) & UINT64_C(0x0000FF0000000000)) >> 24) \
		| (((n) & UINT64_C(0x00FF000000000000)) >> 40) \
		| (((n) & UINT64_C(0xFF00000000000000)) >> 56) \
	)
#endif
// Define conversion macros using the basic byte swapping macros.
#ifdef WORDS_BIGENDIAN
#	ifndef conv16be
#		define conv16be(num) ((uint16_t)(num))
#	endif
#	ifndef conv32be
#		define conv32be(num) ((uint32_t)(num))
#	endif
#	ifndef conv64be
#		define conv64be(num) ((uint64_t)(num))
#	endif
#	ifndef conv16le
#		define conv16le(num) byteswap16(num)
#	endif
#	ifndef conv32le
#		define conv32le(num) byteswap32(num)
#	endif
#	ifndef conv64le
#		define conv64le(num) byteswap64(num)
#	endif
#else
#	ifndef conv16be
#		define conv16be(num) byteswap16(num)
#	endif
#	ifndef conv32be
#		define conv32be(num) byteswap32(num)
#	endif
#	ifndef conv64be
#		define conv64be(num) byteswap64(num)
#	endif
#	ifndef conv16le
#		define conv16le(num) ((uint16_t)(num))
#	endif
#	ifndef conv32le
#		define conv32le(num) ((uint32_t)(num))
#	endif
#	ifndef conv64le
#		define conv64le(num) ((uint64_t)(num))
#	endif
#endif
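
// A quick worked illustration of the macros above (the concrete values are
// only examples, not part of the header): byteswap32() always reverses the
// byte order, while the conv macros are no-ops when the requested byte order
// already matches the native one.
/*
	uint32_t x = 0x11223344;

	uint32_t swapped = byteswap32(x);	// 0x44332211 everywhere

	// On a little endian build: conv32le(x) == x and
	// conv32be(x) == byteswap32(x). On a big endian build
	// it is the other way around.
	uint32_t le = conv32le(x);
	uint32_t be = conv32be(x);
*/
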

////////////////////////////////
// Unaligned reads and writes //
////////////////////////////////

// No-strict-align archs like x86-64
// ---------------------------------
//
// The traditional way of casting e.g. *(const uint16_t *)uint8_pointer
// is bad even if the uint8_pointer is properly aligned because this kind
// of cast breaks strict aliasing rules and results in undefined behavior.
// With unaligned pointers it's even worse: compilers may emit vector
// instructions that require aligned pointers even if non-vector
// instructions work with unaligned pointers.
//
// Using memcpy() is the standard-compliant way to do unaligned access.
// Many modern compilers inline it so there is no function call overhead.
// For those compilers that don't handle the memcpy() method well, the
// old casting method (that violates strict aliasing) can be requested at
// build time. A third method, casting to a packed struct, would also be
// an option but isn't provided to keep things simpler (it's already a mess).
// Hopefully this is flexible enough in practice.
//
// Some compilers on x86-64 like Clang >= 10 and GCC >= 5.1 detect that
//
//     buf[0] | (buf[1] << 8)
//
// reads a 16-bit value and can emit a single 16-bit load and produce
// code identical to the memcpy() method. In other cases Clang and GCC
// produce either the same or better code with memcpy(). For example, Clang 9
// on x86-64 can detect the 32-bit load but not the 16-bit load.
//
// MSVC uses unaligned access with the memcpy() method but emits byte-by-byte
// code for "buf[0] | (buf[1] << 8)".
//
// Conclusion: The memcpy() method is the best choice when unaligned access
// is supported.
//
// Strict-align archs like SPARC
// -----------------------------
//
// GCC versions from around 4.x to at least 13.2.0 produce worse code
// from the memcpy() method than from simple byte-by-byte shift-or code
// when reading a 32-bit integer:
//
//     (1) It may be constructed on the stack using four 8-bit loads,
//         four 8-bit stores to the stack, and finally one 32-bit load
//         from the stack.
//
//     (2) Especially with -Os, an actual memcpy() call may be emitted.
//
// This is true at least on ARM, ARM64, SPARC, SPARC64, MIPS64EL, and
// RISC-V. Of these, ARM, ARM64, and RISC-V support unaligned access in
// some processors but not all, so this is relevant only in the case when
// GCC assumes that unaligned access is not supported or -mstrict-align or
// -mno-unaligned-access is used.
//
// For Clang it makes little difference. ARM64 with -O2 -mstrict-align
// was one of the very few with a minor difference: the memcpy() version
// was one instruction longer.
//
// Conclusion: At least in the case of GCC and Clang, byte-by-byte code is
// the best choice for strict-align archs to do unaligned access.
//
// See also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111502
//
// Thanks to <https://godbolt.org/> it was easy to test different compilers.
// The following is for little endian targets:
/*
#include <stdint.h>
#include <string.h>

uint32_t bytes16(const uint8_t *b)
{
	return (uint32_t)b[0]
		| ((uint32_t)b[1] << 8);
}

uint32_t copy16(const uint8_t *b)
{
	uint16_t v;
	memcpy(&v, b, sizeof(v));
	return v;
}

uint32_t bytes32(const uint8_t *b)
{
	return (uint32_t)b[0]
		| ((uint32_t)b[1] << 8)
		| ((uint32_t)b[2] << 16)
		| ((uint32_t)b[3] << 24);
}

uint32_t copy32(const uint8_t *b)
{
	uint32_t v;
	memcpy(&v, b, sizeof(v));
	return v;
}

void wbytes16(uint8_t *b, uint16_t v)
{
	b[0] = (uint8_t)v;
	b[1] = (uint8_t)(v >> 8);
}

void wcopy16(uint8_t *b, uint16_t v)
{
	memcpy(b, &v, sizeof(v));
}

void wbytes32(uint8_t *b, uint32_t v)
{
	b[0] = (uint8_t)v;
	b[1] = (uint8_t)(v >> 8);
	b[2] = (uint8_t)(v >> 16);
	b[3] = (uint8_t)(v >> 24);
}

void wcopy32(uint8_t *b, uint32_t v)
{
	memcpy(b, &v, sizeof(v));
}
*/

#ifdef TUKLIB_FAST_UNALIGNED_ACCESS

static inline uint16_t
read16ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint16_t *)buf;
#else
	uint16_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint32_t
read32ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint32_t *)buf;
#else
	uint32_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint64_t
read64ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint64_t *)buf;
#else
	uint64_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}


static inline void
write16ne(uint8_t *buf, uint16_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint16_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
write32ne(uint8_t *buf, uint32_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint32_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
write64ne(uint8_t *buf, uint64_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint64_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}


static inline uint16_t
read16be(const uint8_t *buf)
{
	uint16_t num = read16ne(buf);
	return conv16be(num);
}


static inline uint16_t
read16le(const uint8_t *buf)
{
	uint16_t num = read16ne(buf);
	return conv16le(num);
}


static inline uint32_t
read32be(const uint8_t *buf)
{
	uint32_t num = read32ne(buf);
	return conv32be(num);
}


static inline uint32_t
read32le(const uint8_t *buf)
{
	uint32_t num = read32ne(buf);
	return conv32le(num);
}


static inline uint64_t
read64be(const uint8_t *buf)
{
	uint64_t num = read64ne(buf);
	return conv64be(num);
}


static inline uint64_t
read64le(const uint8_t *buf)
{
	uint64_t num = read64ne(buf);
	return conv64le(num);
}


// NOTE: Possible byte swapping must be done in a macro to allow the compiler
// to optimize byte swapping of constants when using glibc's or *BSD's
// byte swapping macros. The actual write is done in an inline function
// to make type checking of the buf pointer possible.
#define write16be(buf, num) write16ne(buf, conv16be(num))
#define write32be(buf, num) write32ne(buf, conv32be(num))
#define write64be(buf, num) write64ne(buf, conv64be(num))
#define write16le(buf, num) write16ne(buf, conv16le(num))
#define write32le(buf, num) write32ne(buf, conv32le(num))
#define write64le(buf, num) write64ne(buf, conv64le(num))
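
// To make the NOTE above concrete, here is a small sketch (the function name
// and the constant are made up for illustration): because write32be() is a
// macro, conv32be() sees the constant directly and the byte swap can be done
// at compile time, leaving only a plain 32-bit store at run time.
/*
static void
store_magic(uint8_t *buf)
{
	write32be(buf, UINT32_C(0x12345678));	// stores 0x12 0x34 0x56 0x78
}
*/
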
#else

#ifdef WORDS_BIGENDIAN
#	define read16ne read16be
#	define read32ne read32be
#	define read64ne read64be
#	define write16ne write16be
#	define write32ne write32be
#	define write64ne write64be
#else
#	define read16ne read16le
#	define read32ne read32le
#	define read64ne read64le
#	define write16ne write16le
#	define write32ne write32le
#	define write64ne write64le
#endif


static inline uint16_t
read16be(const uint8_t *buf)
{
	uint16_t num = ((uint16_t)buf[0] << 8) | (uint16_t)buf[1];
	return num;
}


static inline uint16_t
read16le(const uint8_t *buf)
{
	uint16_t num = ((uint16_t)buf[0]) | ((uint16_t)buf[1] << 8);
	return num;
}


static inline uint32_t
read32be(const uint8_t *buf)
{
	uint32_t num = (uint32_t)buf[0] << 24;
	num |= (uint32_t)buf[1] << 16;
	num |= (uint32_t)buf[2] << 8;
	num |= (uint32_t)buf[3];
	return num;
}


static inline uint32_t
read32le(const uint8_t *buf)
{
	uint32_t num = (uint32_t)buf[0];
	num |= (uint32_t)buf[1] << 8;
	num |= (uint32_t)buf[2] << 16;
	num |= (uint32_t)buf[3] << 24;
	return num;
}


static inline uint64_t
read64be(const uint8_t *buf)
{
	uint64_t num = (uint64_t)buf[0] << 56;
	num |= (uint64_t)buf[1] << 48;
	num |= (uint64_t)buf[2] << 40;
	num |= (uint64_t)buf[3] << 32;
	num |= (uint64_t)buf[4] << 24;
	num |= (uint64_t)buf[5] << 16;
	num |= (uint64_t)buf[6] << 8;
	num |= (uint64_t)buf[7];
	return num;
}


static inline uint64_t
read64le(const uint8_t *buf)
{
	uint64_t num = (uint64_t)buf[0];
	num |= (uint64_t)buf[1] << 8;
	num |= (uint64_t)buf[2] << 16;
	num |= (uint64_t)buf[3] << 24;
	num |= (uint64_t)buf[4] << 32;
	num |= (uint64_t)buf[5] << 40;
	num |= (uint64_t)buf[6] << 48;
	num |= (uint64_t)buf[7] << 56;
	return num;
}


static inline void
write16be(uint8_t *buf, uint16_t num)
{
	buf[0] = (uint8_t)(num >> 8);
	buf[1] = (uint8_t)num;
	return;
}


static inline void
write16le(uint8_t *buf, uint16_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	return;
}


static inline void
write32be(uint8_t *buf, uint32_t num)
{
	buf[0] = (uint8_t)(num >> 24);
	buf[1] = (uint8_t)(num >> 16);
	buf[2] = (uint8_t)(num >> 8);
	buf[3] = (uint8_t)num;
	return;
}


static inline void
write32le(uint8_t *buf, uint32_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	buf[2] = (uint8_t)(num >> 16);
	buf[3] = (uint8_t)(num >> 24);
	return;
}


static inline void
write64be(uint8_t *buf, uint64_t num)
{
	buf[0] = (uint8_t)(num >> 56);
	buf[1] = (uint8_t)(num >> 48);
	buf[2] = (uint8_t)(num >> 40);
	buf[3] = (uint8_t)(num >> 32);
	buf[4] = (uint8_t)(num >> 24);
	buf[5] = (uint8_t)(num >> 16);
	buf[6] = (uint8_t)(num >> 8);
	buf[7] = (uint8_t)num;
	return;
}


static inline void
write64le(uint8_t *buf, uint64_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	buf[2] = (uint8_t)(num >> 16);
	buf[3] = (uint8_t)(num >> 24);
	buf[4] = (uint8_t)(num >> 32);
	buf[5] = (uint8_t)(num >> 40);
	buf[6] = (uint8_t)(num >> 48);
	buf[7] = (uint8_t)(num >> 56);
	return;
}

#endif


//////////////////////////////
// Aligned reads and writes //
//////////////////////////////

// Separate functions for aligned reads and writes are provided since on
// strict-align archs aligned access is much faster than unaligned access.
//
// Just like in the unaligned case, memcpy() is needed to avoid
// strict aliasing violations. However, on archs that don't support
// unaligned access the compiler cannot know that the pointers given
// to memcpy() are aligned, which results in slow code. As of C11 there is
// no standard way to tell the compiler that we know that the address is
// aligned, but some compilers have language extensions to do that. With
// such language extensions the memcpy() method gives excellent results.
//
// What to do on a strict-align system when no known language extensions
// are available? Falling back to byte-by-byte access would be safe but ruin
// optimizations that have been made specifically with aligned access in mind.
// As a compromise, aligned reads will fall back to non-compliant type punning
// but aligned writes will be byte-by-byte, that is, fast reads are preferred
// over fast writes. This obviously isn't great but hopefully it's a working
// compromise for now.
//
// __builtin_assume_aligned is supported by GCC >= 4.7 and Clang >= 3.6.
#ifdef HAVE___BUILTIN_ASSUME_ALIGNED
#	define tuklib_memcpy_aligned(dest, src, size) \
		memcpy(dest, __builtin_assume_aligned(src, size), size)
#else
#	define tuklib_memcpy_aligned(dest, src, size) \
		memcpy(dest, src, size)
#	ifndef TUKLIB_FAST_UNALIGNED_ACCESS
#		define TUKLIB_USE_UNSAFE_ALIGNED_READS 1
#	endif
#endif
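
// A small sketch of what the hint above buys (illustrative only; the function
// name is made up): with __builtin_assume_aligned() the compiler is allowed
// to turn the memcpy() into a single aligned 32-bit load even on strict-align
// targets, while without the hint it must assume the pointer may be
// misaligned.
/*
static uint32_t
load_aligned_u32(const uint8_t *p)	// p must be 4-byte aligned
{
	uint32_t v;
	tuklib_memcpy_aligned(&v, p, sizeof(v));
	return v;
}
*/
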

static inline uint16_t
aligned_read16ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint16_t *)buf;
#else
	uint16_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint32_t
aligned_read32ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint32_t *)buf;
#else
	uint32_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint64_t
aligned_read64ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint64_t *)buf;
#else
	uint64_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


static inline void
aligned_write16ne(uint8_t *buf, uint16_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint16_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
aligned_write32ne(uint8_t *buf, uint32_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint32_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
aligned_write64ne(uint8_t *buf, uint64_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint64_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}


static inline uint16_t
aligned_read16be(const uint8_t *buf)
{
	uint16_t num = aligned_read16ne(buf);
	return conv16be(num);
}


static inline uint16_t
aligned_read16le(const uint8_t *buf)
{
	uint16_t num = aligned_read16ne(buf);
	return conv16le(num);
}


static inline uint32_t
aligned_read32be(const uint8_t *buf)
{
	uint32_t num = aligned_read32ne(buf);
	return conv32be(num);
}


static inline uint32_t
aligned_read32le(const uint8_t *buf)
{
	uint32_t num = aligned_read32ne(buf);
	return conv32le(num);
}


static inline uint64_t
aligned_read64be(const uint8_t *buf)
{
	uint64_t num = aligned_read64ne(buf);
	return conv64be(num);
}


static inline uint64_t
aligned_read64le(const uint8_t *buf)
{
	uint64_t num = aligned_read64ne(buf);
	return conv64le(num);
}


// These need to be macros like in the unaligned case.
#define aligned_write16be(buf, num) aligned_write16ne((buf), conv16be(num))
#define aligned_write16le(buf, num) aligned_write16ne((buf), conv16le(num))
#define aligned_write32be(buf, num) aligned_write32ne((buf), conv32be(num))
#define aligned_write32le(buf, num) aligned_write32ne((buf), conv32le(num))
#define aligned_write64be(buf, num) aligned_write64ne((buf), conv64be(num))
#define aligned_write64le(buf, num) aligned_write64ne((buf), conv64le(num))
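
// Usage note (a sketch; the function and variable names are made up): the
// aligned variants require the caller to guarantee the alignment, for example
// by only indexing whole words from an aligned base pointer:
/*
static uint32_t
word_at(const uint8_t *base, size_t i)	// base must be 4-byte aligned
{
	return aligned_read32le(base + i * 4);
}
*/
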


////////////////////
// Bit operations //
////////////////////

static inline uint32_t
bsr32(uint32_t n)
{
	// Check for ICC first, since it tends to define __GNUC__ too.
#if defined(__INTEL_COMPILER)
	return _bit_scan_reverse(n);

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX
	// GCC >= 3.4 has __builtin_clz(), which gives good results on
	// multiple architectures. On x86, __builtin_clz() ^ 31U becomes
	// either plain BSR (so the XOR gets optimized away) or LZCNT and
	// XOR (if -march indicates that SSE4a instructions are supported).
	return (uint32_t)__builtin_clz(n) ^ 31U;

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsrl %1, %0" : "=r" (i) : "rm" (n));
	return i;

#elif defined(_MSC_VER)
	unsigned long i;
	_BitScanReverse(&i, n);
	return i;

#else
	uint32_t i = 31;

	if ((n & 0xFFFF0000) == 0) {
		n <<= 16;
		i = 15;
	}

	if ((n & 0xFF000000) == 0) {
		n <<= 8;
		i -= 8;
	}

	if ((n & 0xF0000000) == 0) {
		n <<= 4;
		i -= 4;
	}

	if ((n & 0xC0000000) == 0) {
		n <<= 2;
		i -= 2;
	}

	if ((n & 0x80000000) == 0)
		--i;

	return i;
#endif
}


static inline uint32_t
clz32(uint32_t n)
{
#if defined(__INTEL_COMPILER)
	return _bit_scan_reverse(n) ^ 31U;

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX
	return (uint32_t)__builtin_clz(n);

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsrl %1, %0\n\t"
		"xorl $31, %0"
		: "=r" (i) : "rm" (n));
	return i;

#elif defined(_MSC_VER)
	unsigned long i;
	_BitScanReverse(&i, n);
	return i ^ 31U;

#else
	uint32_t i = 0;

	if ((n & 0xFFFF0000) == 0) {
		n <<= 16;
		i = 16;
	}

	if ((n & 0xFF000000) == 0) {
		n <<= 8;
		i += 8;
	}

	if ((n & 0xF0000000) == 0) {
		n <<= 4;
		i += 4;
	}

	if ((n & 0xC0000000) == 0) {
		n <<= 2;
		i += 2;
	}

	if ((n & 0x80000000) == 0)
		++i;

	return i;
#endif
}


static inline uint32_t
ctz32(uint32_t n)
{
#if defined(__INTEL_COMPILER)
	return _bit_scan_forward(n);

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX >= UINT32_MAX
	return (uint32_t)__builtin_ctz(n);

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsfl %1, %0" : "=r" (i) : "rm" (n));
	return i;

#elif defined(_MSC_VER)
	unsigned long i;
	_BitScanForward(&i, n);
	return i;

#else
	uint32_t i = 0;

	if ((n & 0x0000FFFF) == 0) {
		n >>= 16;
		i = 16;
	}

	if ((n & 0x000000FF) == 0) {
		n >>= 8;
		i += 8;
	}

	if ((n & 0x0000000F) == 0) {
		n >>= 4;
		i += 4;
	}

	if ((n & 0x00000003) == 0) {
		n >>= 2;
		i += 2;
	}

	if ((n & 0x00000001) == 0)
		++i;

	return i;
#endif
}

#define bsf32 ctz32
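
// A few worked values for the bit scan helpers above (illustrative only;
// remember that passing 0 to any of them is undefined):
//
//     bsr32(0x00000001) == 0     clz32(0x00000001) == 31   ctz32(0x00000001) == 0
//     bsr32(0x00008000) == 15    clz32(0x00008000) == 16   ctz32(0x00008000) == 15
//     bsr32(0x80000000) == 31    clz32(0x80000000) == 0    ctz32(0x80000000) == 31
//
// For any non-zero n, bsr32(n) == 31 - clz32(n) and bsf32(n) == ctz32(n).
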
#endif