// SPDX-License-Identifier: 0BSD

///////////////////////////////////////////////////////////////////////////////
//
/// \file       tuklib_integer.h
/// \brief      Various integer and bit operations
///
/// This file provides macros or functions to do some basic integer and bit
/// operations.
///
/// Native endian inline functions (XX = 16, 32, or 64):
///   - Unaligned native endian reads: readXXne(ptr)
///   - Unaligned native endian writes: writeXXne(ptr, num)
///   - Aligned native endian reads: aligned_readXXne(ptr)
///   - Aligned native endian writes: aligned_writeXXne(ptr, num)
///
/// Endianness-converting integer operations (these can be macros!)
/// (XX = 16, 32, or 64; Y = b or l):
///   - Byte swapping: byteswapXX(num)
///   - Byte order conversions to/from native (byteswaps if Y isn't
///     the native endianness): convXXYe(num)
///   - Unaligned reads: readXXYe(ptr)
///   - Unaligned writes: writeXXYe(ptr, num)
///   - Aligned reads: aligned_readXXYe(ptr)
///   - Aligned writes: aligned_writeXXYe(ptr, num)
///
/// Since the above can be macros, the arguments should have no side effects
/// because they may be evaluated more than once.
///
/// Bit scan operations for non-zero 32-bit integers (inline functions):
///   - Bit scan reverse (find highest non-zero bit): bsr32(num)
///   - Count leading zeros: clz32(num)
///   - Count trailing zeros: ctz32(num)
///   - Bit scan forward (simply an alias for ctz32()): bsf32(num)
///
/// The above bit scan operations return 0-31. If num is zero,
/// the result is undefined.
//
//  Authors:    Lasse Collin
//
///////////////////////////////////////////////////////////////////////////////
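// The sketch below is only an illustration of how the functions declared in
// this header might be used; it is not part of the header itself. The struct
// and function names (example_header, example_parse, example_serialize) are
// made up for the example.
/*
#include <stdint.h>

typedef struct {
	uint32_t size;   // stored in the buffer as unaligned little endian
	uint32_t log2;   // position of the highest set bit of size
} example_header;

static void
example_parse(example_header *hdr, const uint8_t *buf)
{
	hdr->size = read32le(buf);          // unaligned little-endian read
	hdr->log2 = bsr32(hdr->size | 1);   // bsr32() needs a non-zero argument
}

static void
example_serialize(uint8_t *buf, const example_header *hdr)
{
	write32le(buf, hdr->size);          // unaligned little-endian write
}
*/
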
#ifndef TUKLIB_INTEGER_H
#define TUKLIB_INTEGER_H

#include "tuklib_common.h"
#include <string.h>

// Newer Intel C compilers require immintrin.h for _bit_scan_reverse()
// and such functions.
#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1500)
#	include <immintrin.h>
// Only include <intrin.h> when it is needed. GCC and Clang can both
// use __builtin's, so we only need Windows intrinsics when using MSVC.
// GCC and Clang can set _MSC_VER on Windows, so we need to exclude these
// cases explicitly.
#elif defined(_MSC_VER) && !TUKLIB_GNUC_REQ(3, 4) && !defined(__clang__)
#	include <intrin.h>
#endif


///////////////////
// Byte swapping //
///////////////////
#if defined(HAVE___BUILTIN_BSWAPXX)
	// GCC >= 4.8 and Clang
#	define byteswap16(num) __builtin_bswap16(num)
#	define byteswap32(num) __builtin_bswap32(num)
#	define byteswap64(num) __builtin_bswap64(num)

#elif defined(HAVE_BYTESWAP_H)
	// glibc, uClibc, dietlibc
#	include <byteswap.h>
#	ifdef HAVE_BSWAP_16
#		define byteswap16(num) bswap_16(num)
#	endif
#	ifdef HAVE_BSWAP_32
#		define byteswap32(num) bswap_32(num)
#	endif
#	ifdef HAVE_BSWAP_64
#		define byteswap64(num) bswap_64(num)
#	endif

#elif defined(HAVE_SYS_ENDIAN_H)
	// *BSDs and Darwin
#	include <sys/endian.h>
#	define byteswap16(num) bswap16(num)
#	define byteswap32(num) bswap32(num)
#	define byteswap64(num) bswap64(num)

#elif defined(HAVE_SYS_BYTEORDER_H)
	// Solaris
#	include <sys/byteorder.h>
#	ifdef BSWAP_16
#		define byteswap16(num) BSWAP_16(num)
#	endif
#	ifdef BSWAP_32
#		define byteswap32(num) BSWAP_32(num)
#	endif
#	ifdef BSWAP_64
#		define byteswap64(num) BSWAP_64(num)
#	endif
#	ifdef BE_16
#		define conv16be(num) BE_16(num)
#	endif
#	ifdef BE_32
#		define conv32be(num) BE_32(num)
#	endif
#	ifdef BE_64
#		define conv64be(num) BE_64(num)
#	endif
#	ifdef LE_16
#		define conv16le(num) LE_16(num)
#	endif
#	ifdef LE_32
#		define conv32le(num) LE_32(num)
#	endif
#	ifdef LE_64
#		define conv64le(num) LE_64(num)
#	endif
#endif
#ifndef byteswap16
#	define byteswap16(n) (uint16_t)( \
		  (((n) & 0x00FFU) << 8) \
		| (((n) & 0xFF00U) >> 8) \
	)
#endif

#ifndef byteswap32
#	define byteswap32(n) (uint32_t)( \
		  (((n) & UINT32_C(0x000000FF)) << 24) \
		| (((n) & UINT32_C(0x0000FF00)) << 8) \
		| (((n) & UINT32_C(0x00FF0000)) >> 8) \
		| (((n) & UINT32_C(0xFF000000)) >> 24) \
	)
#endif

#ifndef byteswap64
#	define byteswap64(n) (uint64_t)( \
		  (((n) & UINT64_C(0x00000000000000FF)) << 56) \
		| (((n) & UINT64_C(0x000000000000FF00)) << 40) \
		| (((n) & UINT64_C(0x0000000000FF0000)) << 24) \
		| (((n) & UINT64_C(0x00000000FF000000)) << 8) \
		| (((n) & UINT64_C(0x000000FF00000000)) >> 8) \
		| (((n) & UINT64_C(0x0000FF0000000000)) >> 24) \
		| (((n) & UINT64_C(0x00FF000000000000)) >> 40) \
		| (((n) & UINT64_C(0xFF00000000000000)) >> 56) \
	)
#endif
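// A quick sanity check of the byte swapping macros (an illustrative sketch
// only, not part of this header): both the fallback macros above and the
// intrinsic-based variants must satisfy these equalities.
/*
#include <assert.h>

static void
example_byteswap_check(void)
{
	assert(byteswap16(UINT16_C(0x1122)) == UINT16_C(0x2211));
	assert(byteswap32(UINT32_C(0x11223344)) == UINT32_C(0x44332211));
	assert(byteswap64(UINT64_C(0x1122334455667788))
			== UINT64_C(0x8877665544332211));
}
*/
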
// Define conversion macros using the basic byte swapping macros.
#ifdef WORDS_BIGENDIAN
#	ifndef conv16be
#		define conv16be(num) ((uint16_t)(num))
#	endif
#	ifndef conv32be
#		define conv32be(num) ((uint32_t)(num))
#	endif
#	ifndef conv64be
#		define conv64be(num) ((uint64_t)(num))
#	endif
#	ifndef conv16le
#		define conv16le(num) byteswap16(num)
#	endif
#	ifndef conv32le
#		define conv32le(num) byteswap32(num)
#	endif
#	ifndef conv64le
#		define conv64le(num) byteswap64(num)
#	endif
#else
#	ifndef conv16be
#		define conv16be(num) byteswap16(num)
#	endif
#	ifndef conv32be
#		define conv32be(num) byteswap32(num)
#	endif
#	ifndef conv64be
#		define conv64be(num) byteswap64(num)
#	endif
#	ifndef conv16le
#		define conv16le(num) ((uint16_t)(num))
#	endif
#	ifndef conv32le
#		define conv32le(num) ((uint32_t)(num))
#	endif
#	ifndef conv64le
#		define conv64le(num) ((uint64_t)(num))
#	endif
#endif
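// Illustrative examples (not additional definitions): with the macros above,
// on a little endian host conv32le(num) is effectively a no-op cast while
// conv32be(UINT32_C(0x11223344)) evaluates to UINT32_C(0x44332211);
// on a big endian host the roles are reversed.
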
////////////////////////////////
// Unaligned reads and writes //
////////////////////////////////

// No-strict-align archs like x86-64
// ---------------------------------
//
// The traditional way of casting e.g. *(const uint16_t *)uint8_pointer
// is bad even if the uint8_pointer is properly aligned because this kind
// of cast breaks strict aliasing rules and results in undefined behavior.
// With unaligned pointers it's even worse: compilers may emit vector
// instructions that require aligned pointers even if non-vector
// instructions work with unaligned pointers.
//
// Using memcpy() is the standard compliant way to do unaligned access.
// Many modern compilers inline it so there is no function call overhead.
// For those compilers that don't handle the memcpy() method well, the
// old casting method (that violates strict aliasing) can be requested at
// build time. A third method, casting to a packed struct, would also be
// an option but isn't provided to keep things simpler (it's already a mess).
// Hopefully this is flexible enough in practice.
//
// Some compilers on x86-64 like Clang >= 10 and GCC >= 5.1 detect that
//
//     buf[0] | (buf[1] << 8)
//
// reads a 16-bit value and can emit a single 16-bit load and produce
// code identical to the memcpy() method. In other cases Clang and GCC
// produce either the same or better code with memcpy(). For example, Clang 9
// on x86-64 can detect the 32-bit load but not the 16-bit load.
//
// MSVC uses unaligned access with the memcpy() method but emits byte-by-byte
// code for "buf[0] | (buf[1] << 8)".
//
// Conclusion: The memcpy() method is the best choice when unaligned access
// is supported.
//
// Strict-align archs like SPARC
// -----------------------------
//
// GCC versions from around 4.x to at least 13.2.0 produce worse code
// from the memcpy() method than from simple byte-by-byte shift-or code
// when reading a 32-bit integer:
//
//     (1) It may be constructed on stack using four 8-bit loads,
//         four 8-bit stores to stack, and finally one 32-bit load from stack.
//
//     (2) Especially with -Os, an actual memcpy() call may be emitted.
//
// This is true at least on ARM, ARM64, SPARC, SPARC64, MIPS64EL, and
// RISC-V. Of these, ARM, ARM64, and RISC-V support unaligned access in
// some processors but not all, so this is relevant only in the case when
// GCC assumes that unaligned access is not supported or -mstrict-align or
// -mno-unaligned-access is used.
//
// For Clang it makes little difference. ARM64 with -O2 -mstrict-align
// was one of the very few with a minor difference: the memcpy() version
// was one instruction longer.
//
// Conclusion: At least in the case of GCC and Clang, byte-by-byte code is
// the best choice for strict-align archs to do unaligned access.
//
// See also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111502
//
// Thanks to <https://godbolt.org/> it was easy to test different compilers.
// The following is for little endian targets:
/*
#include <stdint.h>
#include <string.h>

uint32_t bytes16(const uint8_t *b)
{
    return (uint32_t)b[0]
        | ((uint32_t)b[1] << 8);
}

uint32_t copy16(const uint8_t *b)
{
    uint16_t v;
    memcpy(&v, b, sizeof(v));
    return v;
}

uint32_t bytes32(const uint8_t *b)
{
    return (uint32_t)b[0]
        | ((uint32_t)b[1] << 8)
        | ((uint32_t)b[2] << 16)
        | ((uint32_t)b[3] << 24);
}

uint32_t copy32(const uint8_t *b)
{
    uint32_t v;
    memcpy(&v, b, sizeof(v));
    return v;
}

void wbytes16(uint8_t *b, uint16_t v)
{
    b[0] = (uint8_t)v;
    b[1] = (uint8_t)(v >> 8);
}

void wcopy16(uint8_t *b, uint16_t v)
{
    memcpy(b, &v, sizeof(v));
}

void wbytes32(uint8_t *b, uint32_t v)
{
    b[0] = (uint8_t)v;
    b[1] = (uint8_t)(v >> 8);
    b[2] = (uint8_t)(v >> 16);
    b[3] = (uint8_t)(v >> 24);
}

void wcopy32(uint8_t *b, uint32_t v)
{
    memcpy(b, &v, sizeof(v));
}
*/
#ifdef TUKLIB_FAST_UNALIGNED_ACCESS

static inline uint16_t
read16ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint16_t *)buf;
#else
	uint16_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}

static inline uint32_t
read32ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint32_t *)buf;
#else
	uint32_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}

static inline uint64_t
read64ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint64_t *)buf;
#else
	uint64_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}

static inline void
write16ne(uint8_t *buf, uint16_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint16_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}

static inline void
write32ne(uint8_t *buf, uint32_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint32_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}

static inline void
write64ne(uint8_t *buf, uint64_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint64_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}
static inline uint16_t
read16be(const uint8_t *buf)
{
	uint16_t num = read16ne(buf);
	return conv16be(num);
}

static inline uint16_t
read16le(const uint8_t *buf)
{
	uint16_t num = read16ne(buf);
	return conv16le(num);
}

static inline uint32_t
read32be(const uint8_t *buf)
{
	uint32_t num = read32ne(buf);
	return conv32be(num);
}

static inline uint32_t
read32le(const uint8_t *buf)
{
	uint32_t num = read32ne(buf);
	return conv32le(num);
}

static inline uint64_t
read64be(const uint8_t *buf)
{
	uint64_t num = read64ne(buf);
	return conv64be(num);
}

static inline uint64_t
read64le(const uint8_t *buf)
{
	uint64_t num = read64ne(buf);
	return conv64le(num);
}
// NOTE: Possible byte swapping must be done in a macro to allow the compiler
// to optimize byte swapping of constants when using glibc's or *BSD's
// byte swapping macros. The actual write is done in an inline function
// to make type checking of the buf pointer possible.
#define write16be(buf, num) write16ne(buf, conv16be(num))
#define write32be(buf, num) write32ne(buf, conv32be(num))
#define write64be(buf, num) write64ne(buf, conv64be(num))
#define write16le(buf, num) write16ne(buf, conv16le(num))
#define write32le(buf, num) write32ne(buf, conv32le(num))
#define write64le(buf, num) write64ne(buf, conv64le(num))
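// Illustrative example (hypothetical caller, not part of this header):
// because conv32be() is applied inside the macro, a call like
//
//     write32be(buf, 0x000000FD);
//
// lets the compiler byteswap the constant at compile time and store the
// already-swapped value instead of swapping at run time.
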
#else

#ifdef WORDS_BIGENDIAN
#	define read16ne read16be
#	define read32ne read32be
#	define read64ne read64be
#	define write16ne write16be
#	define write32ne write32be
#	define write64ne write64be
#else
#	define read16ne read16le
#	define read32ne read32le
#	define read64ne read64le
#	define write16ne write16le
#	define write32ne write32le
#	define write64ne write64le
#endif
static inline uint16_t
read16be(const uint8_t *buf)
{
	uint16_t num = ((uint16_t)buf[0] << 8) | (uint16_t)buf[1];
	return num;
}

static inline uint16_t
read16le(const uint8_t *buf)
{
	uint16_t num = ((uint16_t)buf[0]) | ((uint16_t)buf[1] << 8);
	return num;
}

static inline uint32_t
read32be(const uint8_t *buf)
{
	uint32_t num = (uint32_t)buf[0] << 24;
	num |= (uint32_t)buf[1] << 16;
	num |= (uint32_t)buf[2] << 8;
	num |= (uint32_t)buf[3];
	return num;
}

static inline uint32_t
read32le(const uint8_t *buf)
{
	uint32_t num = (uint32_t)buf[0];
	num |= (uint32_t)buf[1] << 8;
	num |= (uint32_t)buf[2] << 16;
	num |= (uint32_t)buf[3] << 24;
	return num;
}

static inline uint64_t
read64be(const uint8_t *buf)
{
	uint64_t num = (uint64_t)buf[0] << 56;
	num |= (uint64_t)buf[1] << 48;
	num |= (uint64_t)buf[2] << 40;
	num |= (uint64_t)buf[3] << 32;
	num |= (uint64_t)buf[4] << 24;
	num |= (uint64_t)buf[5] << 16;
	num |= (uint64_t)buf[6] << 8;
	num |= (uint64_t)buf[7];
	return num;
}

static inline uint64_t
read64le(const uint8_t *buf)
{
	uint64_t num = (uint64_t)buf[0];
	num |= (uint64_t)buf[1] << 8;
	num |= (uint64_t)buf[2] << 16;
	num |= (uint64_t)buf[3] << 24;
	num |= (uint64_t)buf[4] << 32;
	num |= (uint64_t)buf[5] << 40;
	num |= (uint64_t)buf[6] << 48;
	num |= (uint64_t)buf[7] << 56;
	return num;
}
static inline void
write16be(uint8_t *buf, uint16_t num)
{
	buf[0] = (uint8_t)(num >> 8);
	buf[1] = (uint8_t)num;
	return;
}

static inline void
write16le(uint8_t *buf, uint16_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	return;
}

static inline void
write32be(uint8_t *buf, uint32_t num)
{
	buf[0] = (uint8_t)(num >> 24);
	buf[1] = (uint8_t)(num >> 16);
	buf[2] = (uint8_t)(num >> 8);
	buf[3] = (uint8_t)num;
	return;
}

static inline void
write32le(uint8_t *buf, uint32_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	buf[2] = (uint8_t)(num >> 16);
	buf[3] = (uint8_t)(num >> 24);
	return;
}

static inline void
write64be(uint8_t *buf, uint64_t num)
{
	buf[0] = (uint8_t)(num >> 56);
	buf[1] = (uint8_t)(num >> 48);
	buf[2] = (uint8_t)(num >> 40);
	buf[3] = (uint8_t)(num >> 32);
	buf[4] = (uint8_t)(num >> 24);
	buf[5] = (uint8_t)(num >> 16);
	buf[6] = (uint8_t)(num >> 8);
	buf[7] = (uint8_t)num;
	return;
}

static inline void
write64le(uint8_t *buf, uint64_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	buf[2] = (uint8_t)(num >> 16);
	buf[3] = (uint8_t)(num >> 24);
	buf[4] = (uint8_t)(num >> 32);
	buf[5] = (uint8_t)(num >> 40);
	buf[6] = (uint8_t)(num >> 48);
	buf[7] = (uint8_t)(num >> 56);
	return;
}

#endif
//////////////////////////////
// Aligned reads and writes //
//////////////////////////////

// Separate functions for aligned reads and writes are provided since on
// strict-align archs aligned access is much faster than unaligned access.
//
// Just like in the unaligned case, memcpy() is needed to avoid
// strict aliasing violations. However, on archs that don't support
// unaligned access the compiler cannot know that the pointers given
// to memcpy() are aligned, which results in slow code. As of C11 there is
// no standard way to tell the compiler that we know that the address is
// aligned but some compilers have language extensions to do that. With
// such language extensions the memcpy() method gives excellent results.
//
// What to do on a strict-align system when no known language extensions
// are available? Falling back to byte-by-byte access would be safe but ruin
// optimizations that have been made specifically with aligned access in mind.
// As a compromise, aligned reads will fall back to non-compliant type punning
// but aligned writes will be byte-by-byte, that is, fast reads are preferred
// over fast writes. This obviously isn't great but hopefully it's a working
// compromise for now.
//
// __builtin_assume_aligned is supported by GCC >= 4.7 and Clang >= 3.6.
#ifdef HAVE___BUILTIN_ASSUME_ALIGNED
#	define tuklib_memcpy_aligned(dest, src, size) \
		memcpy(dest, __builtin_assume_aligned(src, size), size)
#else
#	define tuklib_memcpy_aligned(dest, src, size) \
		memcpy(dest, src, size)
#	ifndef TUKLIB_FAST_UNALIGNED_ACCESS
#		define TUKLIB_USE_UNSAFE_ALIGNED_READS 1
#	endif
#endif
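// Illustrative sketch (hypothetical caller, not part of this header): the
// aligned variants below are meant for buffers whose alignment is already
// known, e.g. an array of 32-bit fields accessed through a uint8_t pointer.
// The function name and its alignment assumption are made up for the example.
/*
static uint32_t
example_sum32le(const uint8_t *buf, size_t count)
{
	// buf is assumed to be correctly aligned for uint32_t and to hold
	// count little-endian 32-bit fields.
	uint32_t sum = 0;
	for (size_t i = 0; i < count; ++i)
		sum += aligned_read32le(buf + i * 4);

	return sum;
}
*/
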
static inline uint16_t
aligned_read16ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint16_t *)buf;
#else
	uint16_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}

static inline uint32_t
aligned_read32ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint32_t *)buf;
#else
	uint32_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}

static inline uint64_t
aligned_read64ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint64_t *)buf;
#else
	uint64_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}

static inline void
aligned_write16ne(uint8_t *buf, uint16_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint16_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}

static inline void
aligned_write32ne(uint8_t *buf, uint32_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint32_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}

static inline void
aligned_write64ne(uint8_t *buf, uint64_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint64_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}
static inline uint16_t
aligned_read16be(const uint8_t *buf)
{
	uint16_t num = aligned_read16ne(buf);
	return conv16be(num);
}

static inline uint16_t
aligned_read16le(const uint8_t *buf)
{
	uint16_t num = aligned_read16ne(buf);
	return conv16le(num);
}

static inline uint32_t
aligned_read32be(const uint8_t *buf)
{
	uint32_t num = aligned_read32ne(buf);
	return conv32be(num);
}

static inline uint32_t
aligned_read32le(const uint8_t *buf)
{
	uint32_t num = aligned_read32ne(buf);
	return conv32le(num);
}

static inline uint64_t
aligned_read64be(const uint8_t *buf)
{
	uint64_t num = aligned_read64ne(buf);
	return conv64be(num);
}

static inline uint64_t
aligned_read64le(const uint8_t *buf)
{
	uint64_t num = aligned_read64ne(buf);
	return conv64le(num);
}
// These need to be macros like in the unaligned case.
#define aligned_write16be(buf, num) aligned_write16ne((buf), conv16be(num))
#define aligned_write16le(buf, num) aligned_write16ne((buf), conv16le(num))
#define aligned_write32be(buf, num) aligned_write32ne((buf), conv32be(num))
#define aligned_write32le(buf, num) aligned_write32ne((buf), conv32le(num))
#define aligned_write64be(buf, num) aligned_write64ne((buf), conv64be(num))
#define aligned_write64le(buf, num) aligned_write64ne((buf), conv64le(num))
static inline uint32_t
bsr32(uint32_t n)
{
	// Check for ICC first, since it tends to define __GNUC__ too.
#if defined(__INTEL_COMPILER)
	return _bit_scan_reverse(n);

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX
	// GCC >= 3.4 has __builtin_clz(), which gives good results on
	// multiple architectures. On x86, __builtin_clz() ^ 31U becomes
	// either plain BSR (so the XOR gets optimized away) or LZCNT and
	// XOR (if -march indicates that SSE4a instructions are supported).
	return (uint32_t)__builtin_clz(n) ^ 31U;

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsrl %1, %0" : "=r" (i) : "rm" (n));
	return i;

#elif defined(_MSC_VER)
	unsigned long i;
	_BitScanReverse(&i, n);
	return i;

#else
	uint32_t i = 31;

	if ((n & 0xFFFF0000) == 0) {
		n <<= 16;
		i = 15;
	}

	if ((n & 0xFF000000) == 0) {
		n <<= 8;
		i -= 8;
	}

	if ((n & 0xF0000000) == 0) {
		n <<= 4;
		i -= 4;
	}

	if ((n & 0xC0000000) == 0) {
		n <<= 2;
		i -= 2;
	}

	if ((n & 0x80000000) == 0)
		--i;

	return i;
#endif
}
static inline uint32_t
clz32(uint32_t n)
{
#if defined(__INTEL_COMPILER)
	return _bit_scan_reverse(n) ^ 31U;

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX
	return (uint32_t)__builtin_clz(n);

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsrl %1, %0\n\t"
		"xorl $31, %0"
		: "=r" (i) : "rm" (n));
	return i;

#elif defined(_MSC_VER)
	unsigned long i;
	_BitScanReverse(&i, n);
	return i ^ 31U;

#else
	uint32_t i = 0;

	if ((n & 0xFFFF0000) == 0) {
		n <<= 16;
		i = 16;
	}

	if ((n & 0xFF000000) == 0) {
		n <<= 8;
		i += 8;
	}

	if ((n & 0xF0000000) == 0) {
		n <<= 4;
		i += 4;
	}

	if ((n & 0xC0000000) == 0) {
		n <<= 2;
		i += 2;
	}

	if ((n & 0x80000000) == 0)
		++i;

	return i;
#endif
}
static inline uint32_t
ctz32(uint32_t n)
{
#if defined(__INTEL_COMPILER)
	return _bit_scan_forward(n);

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX >= UINT32_MAX
	return (uint32_t)__builtin_ctz(n);

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsfl %1, %0" : "=r" (i) : "rm" (n));
	return i;

#elif defined(_MSC_VER)
	unsigned long i;
	_BitScanForward(&i, n);
	return i;

#else
	uint32_t i = 0;

	if ((n & 0x0000FFFF) == 0) {
		n >>= 16;
		i = 16;
	}

	if ((n & 0x000000FF) == 0) {
		n >>= 8;
		i += 8;
	}

	if ((n & 0x0000000F) == 0) {
		n >>= 4;
		i += 4;
	}

	if ((n & 0x00000003) == 0) {
		n >>= 2;
		i += 2;
	}

	if ((n & 0x00000001) == 0)