// SPDX-License-Identifier: 0BSD

///////////////////////////////////////////////////////////////////////////////
//
/// \file       tuklib_integer.h
/// \brief      Various integer and bit operations
///
/// This file provides macros or functions to do some basic integer and bit
/// operations.
///
/// Native endian inline functions (XX = 16, 32, or 64):
///   - Unaligned native endian reads: readXXne(ptr)
///   - Unaligned native endian writes: writeXXne(ptr, num)
///   - Aligned native endian reads: aligned_readXXne(ptr)
///   - Aligned native endian writes: aligned_writeXXne(ptr, num)
///
/// Endianness-converting integer operations (these can be macros!)
/// (XX = 16, 32, or 64; Y = b or l):
///   - Byte swapping: byteswapXX(num)
///   - Byte order conversions to/from native (byteswaps if Y isn't
///     the native endianness): convXXYe(num)
///   - Unaligned reads: readXXYe(ptr)
///   - Unaligned writes: writeXXYe(ptr, num)
///   - Aligned reads: aligned_readXXYe(ptr)
///   - Aligned writes: aligned_writeXXYe(ptr, num)
///
/// Since the above can be macros, the arguments should have no side effects
/// because they may be evaluated more than once.
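///
/// For example, with the generic byteswap32() fallback defined below, a
/// hypothetical call like conv32be(next_word()) could evaluate next_word()
/// several times; pass a side-effect-free expression instead.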
///
/// Bit scan operations for non-zero 32-bit integers (inline functions):
///   - Bit scan reverse (find highest non-zero bit): bsr32(num)
///   - Count leading zeros: clz32(num)
///   - Count trailing zeros: ctz32(num)
///   - Bit scan forward (simply an alias for ctz32()): bsf32(num)
///
/// The above bit scan operations return 0-31. If num is zero,
/// the result is undefined.
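///
/// A small usage sketch (values are illustrative, not from the original):
///
///     uint8_t buf[4] = { 0x12, 0x34, 0x56, 0x78 };
///     uint32_t be = read32be(buf);  // 0x12345678
///     uint32_t le = read32le(buf);  // 0x78563412
///     uint32_t hi = bsr32(be);      // 28, index of the highest set bit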
//
//  Authors:    Lasse Collin
//              Joachim Henke
//
///////////////////////////////////////////////////////////////////////////////

#ifndef TUKLIB_INTEGER_H
#define TUKLIB_INTEGER_H

#include "tuklib_common.h"
#include <string.h>

// Newer Intel C compilers require immintrin.h for _bit_scan_reverse()
// and such functions.
#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1500)
#	include <immintrin.h>
// Only include <intrin.h> when it is needed. GCC and Clang can both
// use __builtin's, so we only need Windows intrinsics when using MSVC.
// GCC and Clang can set _MSC_VER on Windows, so we need to exclude these
// cases explicitly.
#elif defined(_MSC_VER) && !TUKLIB_GNUC_REQ(3, 4) && !defined(__clang__)
#	include <intrin.h>
#endif


///////////////////
// Byte swapping //
///////////////////

#if defined(HAVE___BUILTIN_BSWAPXX)
	// GCC >= 4.8 and Clang
#	define byteswap16(num) __builtin_bswap16(num)
#	define byteswap32(num) __builtin_bswap32(num)
#	define byteswap64(num) __builtin_bswap64(num)

#elif defined(HAVE_BYTESWAP_H)
	// glibc, uClibc, dietlibc
#	include <byteswap.h>
#	ifdef HAVE_BSWAP_16
#		define byteswap16(num) bswap_16(num)
#	endif
#	ifdef HAVE_BSWAP_32
#		define byteswap32(num) bswap_32(num)
#	endif
#	ifdef HAVE_BSWAP_64
#		define byteswap64(num) bswap_64(num)
#	endif
#elif defined(HAVE_SYS_ENDIAN_H)
	// *BSDs and Darwin
#	include <sys/endian.h>
#	ifdef __OpenBSD__
#		define byteswap16(num) swap16(num)
#		define byteswap32(num) swap32(num)
#		define byteswap64(num) swap64(num)
#	else
#		define byteswap16(num) bswap16(num)
#		define byteswap32(num) bswap32(num)
#		define byteswap64(num) bswap64(num)
#	endif
#elif defined(HAVE_SYS_BYTEORDER_H)
	// Solaris
#	include <sys/byteorder.h>
#	ifdef BSWAP_16
#		define byteswap16(num) BSWAP_16(num)
#	endif
#	ifdef BSWAP_32
#		define byteswap32(num) BSWAP_32(num)
#	endif
#	ifdef BSWAP_64
#		define byteswap64(num) BSWAP_64(num)
#	endif
#	ifdef BE_16
#		define conv16be(num) BE_16(num)
#	endif
#	ifdef BE_32
#		define conv32be(num) BE_32(num)
#	endif
#	ifdef BE_64
#		define conv64be(num) BE_64(num)
#	endif
#	ifdef LE_16
#		define conv16le(num) LE_16(num)
#	endif
#	ifdef LE_32
#		define conv32le(num) LE_32(num)
#	endif
#	ifdef LE_64
#		define conv64le(num) LE_64(num)
#	endif
#endif
#ifndef byteswap16
#	define byteswap16(n) (uint16_t)( \
		  (((n) & 0x00FFU) << 8) \
		| (((n) & 0xFF00U) >> 8) \
	)
#endif

#ifndef byteswap32
#	define byteswap32(n) (uint32_t)( \
		  (((n) & UINT32_C(0x000000FF)) << 24) \
		| (((n) & UINT32_C(0x0000FF00)) << 8) \
		| (((n) & UINT32_C(0x00FF0000)) >> 8) \
		| (((n) & UINT32_C(0xFF000000)) >> 24) \
	)
#endif

#ifndef byteswap64
#	define byteswap64(n) (uint64_t)( \
		  (((n) & UINT64_C(0x00000000000000FF)) << 56) \
		| (((n) & UINT64_C(0x000000000000FF00)) << 40) \
		| (((n) & UINT64_C(0x0000000000FF0000)) << 24) \
		| (((n) & UINT64_C(0x00000000FF000000)) << 8) \
		| (((n) & UINT64_C(0x000000FF00000000)) >> 8) \
		| (((n) & UINT64_C(0x0000FF0000000000)) >> 24) \
		| (((n) & UINT64_C(0x00FF000000000000)) >> 40) \
		| (((n) & UINT64_C(0xFF00000000000000)) >> 56) \
	)
#endif
// Define conversion macros using the basic byte swapping macros.
#ifdef WORDS_BIGENDIAN
#	ifndef conv16be
#		define conv16be(num) ((uint16_t)(num))
#	endif
#	ifndef conv32be
#		define conv32be(num) ((uint32_t)(num))
#	endif
#	ifndef conv64be
#		define conv64be(num) ((uint64_t)(num))
#	endif
#	ifndef conv16le
#		define conv16le(num) byteswap16(num)
#	endif
#	ifndef conv32le
#		define conv32le(num) byteswap32(num)
#	endif
#	ifndef conv64le
#		define conv64le(num) byteswap64(num)
#	endif
#else
#	ifndef conv16be
#		define conv16be(num) byteswap16(num)
#	endif
#	ifndef conv32be
#		define conv32be(num) byteswap32(num)
#	endif
#	ifndef conv64be
#		define conv64be(num) byteswap64(num)
#	endif
#	ifndef conv16le
#		define conv16le(num) ((uint16_t)(num))
#	endif
#	ifndef conv32le
#		define conv32le(num) ((uint32_t)(num))
#	endif
#	ifndef conv64le
#		define conv64le(num) ((uint64_t)(num))
#	endif
#endif
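
// For example (illustrative, not from the original file): on a little endian
// target, conv32be(0x11223344) expands to byteswap32(0x11223344) and
// evaluates to 0x44332211, while conv32le() is just a cast.
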
////////////////////////////////
// Unaligned reads and writes //
////////////////////////////////

// No-strict-align archs like x86-64
// ---------------------------------
//
// The traditional way of casting e.g. *(const uint16_t *)uint8_pointer
// is bad even if the uint8_pointer is properly aligned because this kind
// of cast breaks strict aliasing rules and results in undefined behavior.
// With unaligned pointers it's even worse: compilers may emit vector
// instructions that require aligned pointers even if non-vector
// instructions work with unaligned pointers.
//
// Using memcpy() is the standard-compliant way to do unaligned access.
// Many modern compilers inline it so there is no function call overhead.
// For those compilers that don't handle the memcpy() method well, the
// old casting method (that violates strict aliasing) can be requested at
// build time. A third method, casting to a packed struct, would also be
// an option but isn't provided to keep things simpler (it's already a mess).
// Hopefully this is flexible enough in practice.
//
// Some compilers on x86-64 like Clang >= 10 and GCC >= 5.1 detect that
//
//     buf[0] | (buf[1] << 8)
//
// reads a 16-bit value and can emit a single 16-bit load and produce
// code identical to the memcpy() method. In other cases Clang and GCC
// produce either the same or better code with memcpy(). For example, Clang 9
// on x86-64 can detect a 32-bit load but not a 16-bit load.
//
// MSVC uses unaligned access with the memcpy() method but emits byte-by-byte
// code for "buf[0] | (buf[1] << 8)".
//
// Conclusion: The memcpy() method is the best choice when unaligned access
// is supported.
// Strict-align archs like SPARC
// -----------------------------
//
// GCC versions from around 4.x to at least 13.2.0 produce worse code
// from the memcpy() method than from simple byte-by-byte shift-or code
// when reading a 32-bit integer:
//
//     (1) It may be constructed on stack using four 8-bit loads,
//         four 8-bit stores to stack, and finally one 32-bit load from stack.
//
//     (2) Especially with -Os, an actual memcpy() call may be emitted.
//
// This is true at least on ARM, ARM64, SPARC, SPARC64, MIPS64EL, and
// RISC-V. Of these, ARM, ARM64, and RISC-V support unaligned access in
// some processors but not all, so this is relevant only in the case when
// GCC assumes that unaligned access is not supported or -mstrict-align or
// -mno-unaligned-access is used.
//
// For Clang it makes little difference. ARM64 with -O2 -mstrict-align
// was one of the very few with a minor difference: the memcpy() version
// was one instruction longer.
//
// Conclusion: At least in the case of GCC and Clang, byte-by-byte code is
// the best choice for strict-align archs to do unaligned access.
//
// See also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111502
//
// Thanks to <https://godbolt.org/> it was easy to test different compilers.
// The following is for little endian targets:
/*
#include <stdint.h>
#include <string.h>

uint32_t bytes16(const uint8_t *b)
{
	return (uint32_t)b[0]
		| ((uint32_t)b[1] << 8);
}

uint32_t copy16(const uint8_t *b)
{
	uint16_t v;
	memcpy(&v, b, sizeof(v));
	return v;
}

uint32_t bytes32(const uint8_t *b)
{
	return (uint32_t)b[0]
		| ((uint32_t)b[1] << 8)
		| ((uint32_t)b[2] << 16)
		| ((uint32_t)b[3] << 24);
}

uint32_t copy32(const uint8_t *b)
{
	uint32_t v;
	memcpy(&v, b, sizeof(v));
	return v;
}

void wbytes16(uint8_t *b, uint16_t v)
{
	b[0] = (uint8_t)v;
	b[1] = (uint8_t)(v >> 8);
}

void wcopy16(uint8_t *b, uint16_t v)
{
	memcpy(b, &v, sizeof(v));
}

void wbytes32(uint8_t *b, uint32_t v)
{
	b[0] = (uint8_t)v;
	b[1] = (uint8_t)(v >> 8);
	b[2] = (uint8_t)(v >> 16);
	b[3] = (uint8_t)(v >> 24);
}

void wcopy32(uint8_t *b, uint32_t v)
{
	memcpy(b, &v, sizeof(v));
}
*/
#ifdef TUKLIB_FAST_UNALIGNED_ACCESS

static inline uint16_t
read16ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint16_t *)buf;
#else
	uint16_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint32_t
read32ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint32_t *)buf;
#else
	uint32_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint64_t
read64ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint64_t *)buf;
#else
	uint64_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}


static inline void
write16ne(uint8_t *buf, uint16_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint16_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
write32ne(uint8_t *buf, uint32_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint32_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
write64ne(uint8_t *buf, uint64_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint64_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}

static inline uint16_t
read16be(const uint8_t *buf)
{
	uint16_t num = read16ne(buf);
	return conv16be(num);
}


static inline uint16_t
read16le(const uint8_t *buf)
{
	uint16_t num = read16ne(buf);
	return conv16le(num);
}


static inline uint32_t
read32be(const uint8_t *buf)
{
	uint32_t num = read32ne(buf);
	return conv32be(num);
}


static inline uint32_t
read32le(const uint8_t *buf)
{
	uint32_t num = read32ne(buf);
	return conv32le(num);
}


static inline uint64_t
read64be(const uint8_t *buf)
{
	uint64_t num = read64ne(buf);
	return conv64be(num);
}


static inline uint64_t
read64le(const uint8_t *buf)
{
	uint64_t num = read64ne(buf);
	return conv64le(num);
}

// NOTE: Possible byte swapping must be done in a macro to allow the compiler
// to optimize byte swapping of constants when using glibc's or *BSD's
// byte swapping macros. The actual write is done in an inline function
// to make type checking of the buf pointer possible.
#define write16be(buf, num) write16ne(buf, conv16be(num))
#define write32be(buf, num) write32ne(buf, conv32be(num))
#define write64be(buf, num) write64ne(buf, conv64be(num))
#define write16le(buf, num) write16ne(buf, conv16le(num))
#define write32le(buf, num) write32ne(buf, conv32le(num))
#define write64le(buf, num) write64ne(buf, conv64le(num))
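
// For example (illustrative, not from the original file): because the
// conversion happens in the macro, a constant argument can be byteswapped
// at compile time, so
//
//     write32be(buf, UINT32_C(0x12345678));
//
// stores the bytes 0x12, 0x34, 0x56, 0x78 in that order with no runtime swap.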

#else

#ifdef WORDS_BIGENDIAN
#	define read16ne read16be
#	define read32ne read32be
#	define read64ne read64be
#	define write16ne write16be
#	define write32ne write32be
#	define write64ne write64be
#else
#	define read16ne read16le
#	define read32ne read32le
#	define read64ne read64le
#	define write16ne write16le
#	define write32ne write32le
#	define write64ne write64le
#endif

static inline uint16_t
read16be(const uint8_t *buf)
{
	uint16_t num = ((uint16_t)buf[0] << 8) | (uint16_t)buf[1];
	return num;
}


static inline uint16_t
read16le(const uint8_t *buf)
{
	uint16_t num = ((uint16_t)buf[0]) | ((uint16_t)buf[1] << 8);
	return num;
}


static inline uint32_t
read32be(const uint8_t *buf)
{
	uint32_t num = (uint32_t)buf[0] << 24;
	num |= (uint32_t)buf[1] << 16;
	num |= (uint32_t)buf[2] << 8;
	num |= (uint32_t)buf[3];
	return num;
}


static inline uint32_t
read32le(const uint8_t *buf)
{
	uint32_t num = (uint32_t)buf[0];
	num |= (uint32_t)buf[1] << 8;
	num |= (uint32_t)buf[2] << 16;
	num |= (uint32_t)buf[3] << 24;
	return num;
}


static inline uint64_t
read64be(const uint8_t *buf)
{
	uint64_t num = (uint64_t)buf[0] << 56;
	num |= (uint64_t)buf[1] << 48;
	num |= (uint64_t)buf[2] << 40;
	num |= (uint64_t)buf[3] << 32;
	num |= (uint64_t)buf[4] << 24;
	num |= (uint64_t)buf[5] << 16;
	num |= (uint64_t)buf[6] << 8;
	num |= (uint64_t)buf[7];
	return num;
}


static inline uint64_t
read64le(const uint8_t *buf)
{
	uint64_t num = (uint64_t)buf[0];
	num |= (uint64_t)buf[1] << 8;
	num |= (uint64_t)buf[2] << 16;
	num |= (uint64_t)buf[3] << 24;
	num |= (uint64_t)buf[4] << 32;
	num |= (uint64_t)buf[5] << 40;
	num |= (uint64_t)buf[6] << 48;
	num |= (uint64_t)buf[7] << 56;
	return num;
}

static inline void
write16be(uint8_t *buf, uint16_t num)
{
	buf[0] = (uint8_t)(num >> 8);
	buf[1] = (uint8_t)num;
	return;
}


static inline void
write16le(uint8_t *buf, uint16_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	return;
}


static inline void
write32be(uint8_t *buf, uint32_t num)
{
	buf[0] = (uint8_t)(num >> 24);
	buf[1] = (uint8_t)(num >> 16);
	buf[2] = (uint8_t)(num >> 8);
	buf[3] = (uint8_t)num;
	return;
}


static inline void
write32le(uint8_t *buf, uint32_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	buf[2] = (uint8_t)(num >> 16);
	buf[3] = (uint8_t)(num >> 24);
	return;
}


static inline void
write64be(uint8_t *buf, uint64_t num)
{
	buf[0] = (uint8_t)(num >> 56);
	buf[1] = (uint8_t)(num >> 48);
	buf[2] = (uint8_t)(num >> 40);
	buf[3] = (uint8_t)(num >> 32);
	buf[4] = (uint8_t)(num >> 24);
	buf[5] = (uint8_t)(num >> 16);
	buf[6] = (uint8_t)(num >> 8);
	buf[7] = (uint8_t)num;
	return;
}


static inline void
write64le(uint8_t *buf, uint64_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	buf[2] = (uint8_t)(num >> 16);
	buf[3] = (uint8_t)(num >> 24);
	buf[4] = (uint8_t)(num >> 32);
	buf[5] = (uint8_t)(num >> 40);
	buf[6] = (uint8_t)(num >> 48);
	buf[7] = (uint8_t)(num >> 56);
	return;
}

#endif

//////////////////////////////
// Aligned reads and writes //
//////////////////////////////

// Separate functions for aligned reads and writes are provided since on
// strict-align archs aligned access is much faster than unaligned access.
//
// Just like in the unaligned case, memcpy() is needed to avoid
// strict aliasing violations. However, on archs that don't support
// unaligned access the compiler cannot know that the pointers given
// to memcpy() are aligned which results in slow code. As of C11 there is
// no standard way to tell the compiler that we know that the address is
// aligned but some compilers have language extensions to do that. With
// such language extensions the memcpy() method gives excellent results.
//
// What to do on a strict-align system when no known language extensions
// are available? Falling back to byte-by-byte access would be safe but ruin
// optimizations that have been made specifically with aligned access in mind.
// As a compromise, aligned reads will fall back to non-compliant type punning
// but aligned writes will be byte-by-byte, that is, fast reads are preferred
// over fast writes. This obviously isn't great but hopefully it's a working
// compromise for now.
//
// __builtin_assume_aligned is supported by GCC >= 4.7 and Clang >= 3.6.
#ifdef HAVE___BUILTIN_ASSUME_ALIGNED
#	define tuklib_memcpy_aligned(dest, src, size) \
		memcpy(dest, __builtin_assume_aligned(src, size), size)
#else
#	define tuklib_memcpy_aligned(dest, src, size) \
		memcpy(dest, src, size)
#	ifndef TUKLIB_FAST_UNALIGNED_ACCESS
#		define TUKLIB_USE_UNSAFE_ALIGNED_READS 1
#	endif
#endif

static inline uint16_t
aligned_read16ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint16_t *)buf;
#else
	uint16_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint32_t
aligned_read32ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint32_t *)buf;
#else
	uint32_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint64_t
aligned_read64ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint64_t *)buf;
#else
	uint64_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


static inline void
aligned_write16ne(uint8_t *buf, uint16_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint16_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
aligned_write32ne(uint8_t *buf, uint32_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint32_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
aligned_write64ne(uint8_t *buf, uint64_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint64_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}

static inline uint16_t
aligned_read16be(const uint8_t *buf)
{
	uint16_t num = aligned_read16ne(buf);
	return conv16be(num);
}


static inline uint16_t
aligned_read16le(const uint8_t *buf)
{
	uint16_t num = aligned_read16ne(buf);
	return conv16le(num);
}


static inline uint32_t
aligned_read32be(const uint8_t *buf)
{
	uint32_t num = aligned_read32ne(buf);
	return conv32be(num);
}


static inline uint32_t
aligned_read32le(const uint8_t *buf)
{
	uint32_t num = aligned_read32ne(buf);
	return conv32le(num);
}


static inline uint64_t
aligned_read64be(const uint8_t *buf)
{
	uint64_t num = aligned_read64ne(buf);
	return conv64be(num);
}


static inline uint64_t
aligned_read64le(const uint8_t *buf)
{
	uint64_t num = aligned_read64ne(buf);
	return conv64le(num);
}

// These need to be macros like in the unaligned case.
#define aligned_write16be(buf, num) aligned_write16ne((buf), conv16be(num))
#define aligned_write16le(buf, num) aligned_write16ne((buf), conv16le(num))
#define aligned_write32be(buf, num) aligned_write32ne((buf), conv32be(num))
#define aligned_write32le(buf, num) aligned_write32ne((buf), conv32le(num))
#define aligned_write64be(buf, num) aligned_write64ne((buf), conv64be(num))
#define aligned_write64le(buf, num) aligned_write64ne((buf), conv64le(num))
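
// A hypothetical usage sketch (not part of the original header): summing
// 32-bit little endian integers from a buffer whose start is known to be
// suitably aligned, e.g. memory returned by malloc():
/*
static uint32_t
sum32le_aligned(const uint8_t *buf, size_t count)
{
	uint32_t sum = 0;
	for (size_t i = 0; i < count; ++i)
		sum += aligned_read32le(buf + 4 * i);
	return sum;
}
*/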

////////////////////
// Bit operations //
////////////////////


static inline uint32_t
bsr32(uint32_t n)
{
	// Check for ICC first, since it tends to define __GNUC__ too.
#if defined(__INTEL_COMPILER)
	return _bit_scan_reverse(n);

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX
	// GCC >= 3.4 has __builtin_clz(), which gives good results on
	// multiple architectures. On x86, __builtin_clz() ^ 31U becomes
	// either plain BSR (so the XOR gets optimized away) or LZCNT and
	// XOR (if -march indicates that SSE4a instructions are supported).
	return (uint32_t)__builtin_clz(n) ^ 31U;

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsrl %1, %0" : "=r" (i) : "rm" (n));
	return i;

#elif defined(_MSC_VER)
	unsigned long i;
	_BitScanReverse(&i, n);
	return i;

#else
	uint32_t i = 31;

	if ((n & 0xFFFF0000) == 0) {
		n <<= 16;
		i = 15;
	}

	if ((n & 0xFF000000) == 0) {
		n <<= 8;
		i -= 8;
	}

	if ((n & 0xF0000000) == 0) {
		n <<= 4;
		i -= 4;
	}

	if ((n & 0xC0000000) == 0) {
		n <<= 2;
		i -= 2;
	}

	if ((n & 0x80000000) == 0)
		--i;

	return i;
#endif
}


static inline uint32_t
clz32(uint32_t n)
{
#if defined(__INTEL_COMPILER)
	return _bit_scan_reverse(n) ^ 31U;

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX
	return (uint32_t)__builtin_clz(n);

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsrl %1, %0\n\t"
		"xorl $31, %0"
		: "=r" (i) : "rm" (n));
	return i;

#elif defined(_MSC_VER)
	unsigned long i;
	_BitScanReverse(&i, n);
	return i ^ 31U;

#else
	uint32_t i = 0;

	if ((n & 0xFFFF0000) == 0) {
		n <<= 16;
		i = 16;
	}

	if ((n & 0xFF000000) == 0) {
		n <<= 8;
		i += 8;
	}

	if ((n & 0xF0000000) == 0) {
		n <<= 4;
		i += 4;
	}

	if ((n & 0xC0000000) == 0) {
		n <<= 2;
		i += 2;
	}

	if ((n & 0x80000000) == 0)
		++i;

	return i;
#endif
}


static inline uint32_t
ctz32(uint32_t n)
{
#if defined(__INTEL_COMPILER)
	return _bit_scan_forward(n);

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX >= UINT32_MAX
	return (uint32_t)__builtin_ctz(n);

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsfl %1, %0" : "=r" (i) : "rm" (n));
	return i;

#elif defined(_MSC_VER)
	unsigned long i;
	_BitScanForward(&i, n);
	return i;

#else
	uint32_t i = 0;

	if ((n & 0x0000FFFF) == 0) {
		n >>= 16;
		i = 16;
	}

	if ((n & 0x000000FF) == 0) {
		n >>= 8;
		i += 8;
	}

	if ((n & 0x0000000F) == 0) {
		n >>= 4;
		i += 4;
	}

	if ((n & 0x00000003) == 0) {
		n >>= 2;
		i += 2;
	}

	if ((n & 0x00000001) == 0)
		++i;

	return i;
#endif
}
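
// Sanity examples for the bit scan operations (illustrative, not from the
// original file):
//
//     bsr32(1) == 0      bsr32(UINT32_C(0x80000000)) == 31
//     clz32(1) == 31     clz32(UINT32_C(0x80000000)) == 0
//     ctz32(8) == 3      ctz32(UINT32_C(0x80000000)) == 31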
& 0x00000001) == 0)