src/common/tuklib_integer.h

   1 ///////////////////////////////////////////////////////////////////////////////
   2 //
   3 /// \file       tuklib_integer.h
   4 /// \brief      Various integer and bit operations
   5 ///
   6 /// This file provides macros or functions to do some basic integer and bit
   7 /// operations.
   8 ///
   9 /// Native endian inline functions (XX = 16, 32, or 64):
  10 ///   - Unaligned native endian reads: readXXne(ptr)
  11 ///   - Unaligned native endian writes: writeXXne(ptr, num)
  12 ///   - Aligned native endian reads: aligned_readXXne(ptr)
  13 ///   - Aligned native endian writes: aligned_writeXXne(ptr, num)
  14 ///
  15 /// Endianness-converting integer operations (these can be macros!)
  16 /// (XX = 16, 32, or 64; Y = b or l):
  17 ///   - Byte swapping: bswapXX(num)
  18 ///   - Byte order conversions to/from native (byteswaps if Y isn't
  19 ///     the native endianness): convXXYe(num)
  20 ///   - Unaligned reads: readXXYe(ptr)
  21 ///   - Unaligned writes: writeXXYe(ptr, num)
  22 ///   - Aligned reads: aligned_readXXYe(ptr)
  23 ///   - Aligned writes: aligned_writeXXYe(ptr, num)
  24 ///
  25 /// Since the above can macros, the arguments should have no side effects
  26 /// because they may be evaluated more than once.
  27 ///
  28 /// Bit scan operations for non-zero 32-bit integers (inline functions):
  29 ///   - Bit scan reverse (find highest non-zero bit): bsr32(num)
  30 ///   - Count leading zeros: clz32(num)
  31 ///   - Count trailing zeros: ctz32(num)
  32 ///   - Bit scan forward (simply an alias for ctz32()): bsf32(num)
  33 ///
  34 /// The above bit scan operations return 0-31. If num is zero,
  35 /// the result is undefined.
  36 //
  37 //  Authors:    Lasse Collin
  38 //              Joachim Henke
  39 //
  40 //  This file has been put into the public domain.
  41 //  You can do whatever you want with this file.
  42 //
  43 ///////////////////////////////////////////////////////////////////////////////
  44
  45 #ifndef TUKLIB_INTEGER_H
  46 #define TUKLIB_INTEGER_H
  47
  48 #include "tuklib_common.h"
  49 #include <string.h>
  50
  51 // Newer Intel C compilers require immintrin.h for _bit_scan_reverse()
  52 // and such functions.
  53 #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1500)
  54 #       include <immintrin.h>
  55 // Only include <intrin.h> when it is needed. GCC and Clang can both
  56 // use __builtin's, so we only need Windows instrincs when using MSVC.
  57 // GCC and Clang can set _MSC_VER on Windows, so we need to exclude these
  58 // cases explicitly.
  59 #elif defined(_MSC_VER) && !TUKLIB_GNUC_REQ(3, 4) && !defined(__clang__)
  60 #       include <intrin.h>
  61 #endif
  62
  63
  64 ///////////////////
  65 // Byte swapping //
  66 ///////////////////
  67
  68 #if defined(HAVE___BUILTIN_BSWAPXX)
  69         // GCC >= 4.8 and Clang
  70 #       define bswap16(n) __builtin_bswap16(n)
  71 #       define bswap32(n) __builtin_bswap32(n)
  72 #       define bswap64(n) __builtin_bswap64(n)
  73
  74 #elif defined(HAVE_BYTESWAP_H)
  75         // glibc, uClibc, dietlibc
  76 #       include <byteswap.h>
  77 #       ifdef HAVE_BSWAP_16
  78 #               define bswap16(num) bswap_16(num)
  79 #       endif
  80 #       ifdef HAVE_BSWAP_32
  81 #               define bswap32(num) bswap_32(num)
  82 #       endif
  83 #       ifdef HAVE_BSWAP_64
  84 #               define bswap64(num) bswap_64(num)
  85 #       endif
  86
  87 #elif defined(HAVE_SYS_ENDIAN_H)
  88         // *BSDs and Darwin
  89 #       include <sys/endian.h>
  90
  91 #elif defined(HAVE_SYS_BYTEORDER_H)
  92         // Solaris
  93 #       include <sys/byteorder.h>
  94 #       ifdef BSWAP_16
  95 #               define bswap16(num) BSWAP_16(num)
  96 #       endif
  97 #       ifdef BSWAP_32
  98 #               define bswap32(num) BSWAP_32(num)
  99 #       endif
 100 #       ifdef BSWAP_64
 101 #               define bswap64(num) BSWAP_64(num)
 102 #       endif
 103 #       ifdef BE_16
 104 #               define conv16be(num) BE_16(num)
 105 #       endif
 106 #       ifdef BE_32
 107 #               define conv32be(num) BE_32(num)
 108 #       endif
 109 #       ifdef BE_64
 110 #               define conv64be(num) BE_64(num)
 111 #       endif
 112 #       ifdef LE_16
 113 #               define conv16le(num) LE_16(num)
 114 #       endif
 115 #       ifdef LE_32
 116 #               define conv32le(num) LE_32(num)
 117 #       endif
 118 #       ifdef LE_64
 119 #               define conv64le(num) LE_64(num)
 120 #       endif
 121 #endif
 122
 123 #ifndef bswap16
 124 #       define bswap16(n) (uint16_t)( \
 125                   (((n) & 0x00FFU) << 8) \
 126                 | (((n) & 0xFF00U) >> 8) \
 127         )
 128 #endif
 129
 130 #ifndef bswap32
 131 #       define bswap32(n) (uint32_t)( \
 132                   (((n) & UINT32_C(0x000000FF)) << 24) \
 133                 | (((n) & UINT32_C(0x0000FF00)) << 8) \
 134                 | (((n) & UINT32_C(0x00FF0000)) >> 8) \
 135                 | (((n) & UINT32_C(0xFF000000)) >> 24) \
 136         )
 137 #endif
 138
 139 #ifndef bswap64
 140 #       define bswap64(n) (uint64_t)( \
 141                   (((n) & UINT64_C(0x00000000000000FF)) << 56) \
 142                 | (((n) & UINT64_C(0x000000000000FF00)) << 40) \
 143                 | (((n) & UINT64_C(0x0000000000FF0000)) << 24) \
 144                 | (((n) & UINT64_C(0x00000000FF000000)) << 8) \
 145                 | (((n) & UINT64_C(0x000000FF00000000)) >> 8) \
 146                 | (((n) & UINT64_C(0x0000FF0000000000)) >> 24) \
 147                 | (((n) & UINT64_C(0x00FF000000000000)) >> 40) \
 148                 | (((n) & UINT64_C(0xFF00000000000000)) >> 56) \
 149         )
 150 #endif
 151
 152 // Define conversion macros using the basic byte swapping macros.
 153 #ifdef WORDS_BIGENDIAN
 154 #       ifndef conv16be
 155 #               define conv16be(num) ((uint16_t)(num))
 156 #       endif
 157 #       ifndef conv32be
 158 #               define conv32be(num) ((uint32_t)(num))
 159 #       endif
 160 #       ifndef conv64be
 161 #               define conv64be(num) ((uint64_t)(num))
 162 #       endif
 163 #       ifndef conv16le
 164 #               define conv16le(num) bswap16(num)
 165 #       endif
 166 #       ifndef conv32le
 167 #               define conv32le(num) bswap32(num)
 168 #       endif
 169 #       ifndef conv64le
 170 #               define conv64le(num) bswap64(num)
 171 #       endif
 172 #else
 173 #       ifndef conv16be
 174 #               define conv16be(num) bswap16(num)
 175 #       endif
 176 #       ifndef conv32be
 177 #               define conv32be(num) bswap32(num)
 178 #       endif
 179 #       ifndef conv64be
 180 #               define conv64be(num) bswap64(num)
 181 #       endif
 182 #       ifndef conv16le
 183 #               define conv16le(num) ((uint16_t)(num))
 184 #       endif
 185 #       ifndef conv32le
 186 #               define conv32le(num) ((uint32_t)(num))
 187 #       endif
 188 #       ifndef conv64le
 189 #               define conv64le(num) ((uint64_t)(num))
 190 #       endif
 191 #endif
 192
 193
 194 ////////////////////////////////
 195 // Unaligned reads and writes //
 196 ////////////////////////////////
 197
 198 // The traditional way of casting e.g. *(const uint16_t *)uint8_pointer
 199 // is bad even if the uint8_pointer is properly aligned because this kind
 200 // of casts break strict aliasing rules and result in undefined behavior.
 201 // With unaligned pointers it's even worse: compilers may emit vector
 202 // instructions that require aligned pointers even if non-vector
 203 // instructions work with unaligned pointers.
 204 //
 205 // Using memcpy() is the standard compliant way to do unaligned access.
 206 // Many modern compilers inline it so there is no function call overhead.
 207 // For those compilers that don't handle the memcpy() method well, the
 208 // old casting method (that violates strict aliasing) can be requested at
 209 // build time. A third method, casting to a packed struct, would also be
 210 // an option but isn't provided to keep things simpler (it's already a mess).
 211 // Hopefully this is flexible enough in practice.
 212
 213 static inline uint16_t
 214 read16ne(const uint8_t *buf)
 215 {
 216 #if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \
 217                 && defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING)
 218         return *(const uint16_t *)buf;
 219 #else
 220         uint16_t num;
 221         memcpy(&num, buf, sizeof(num));
 222         return num;
 223 #endif
 224 }
 225
 226
 227 static inline uint32_t
 228 read32ne(const uint8_t *buf)
 229 {
 230 #if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \
 231                 && defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING)
 232         return *(const uint32_t *)buf;
 233 #else
 234         uint32_t num;
 235         memcpy(&num, buf, sizeof(num));
 236         return num;
 237 #endif
 238 }
 239
 240
 241 static inline uint64_t
 242 read64ne(const uint8_t *buf)
 243 {
 244 #if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \
 245                 && defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING)
 246         return *(const uint64_t *)buf;
 247 #else
 248         uint64_t num;
 249         memcpy(&num, buf, sizeof(num));
 250         return num;
 251 #endif
 252 }
 253
 254
 255 static inline void
 256 write16ne(uint8_t *buf, uint16_t num)
 257 {
 258 #if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \
 259                 && defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING)
 260         *(uint16_t *)buf = num;
 261 #else
 262         memcpy(buf, &num, sizeof(num));
 263 #endif
 264         return;
 265 }
 266
 267
 268 static inline void
 269 write32ne(uint8_t *buf, uint32_t num)
 270 {
 271 #if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \
 272                 && defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING)
 273         *(uint32_t *)buf = num;
 274 #else
 275         memcpy(buf, &num, sizeof(num));
 276 #endif
 277         return;
 278 }
 279
 280
 281 static inline void
 282 write64ne(uint8_t *buf, uint64_t num)
 283 {
 284 #if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \
 285                 && defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING)
 286         *(uint64_t *)buf = num;
 287 #else
 288         memcpy(buf, &num, sizeof(num));
 289 #endif
 290         return;
 291 }
 292
 293
 294 static inline uint16_t
 295 read16be(const uint8_t *buf)
 296 {
 297 #if defined(WORDS_BIGENDIAN) || defined(TUKLIB_FAST_UNALIGNED_ACCESS)
 298         uint16_t num = read16ne(buf);
 299         return conv16be(num);
 300 #else
 301         uint16_t num = ((uint16_t)buf[0] << 8) | (uint16_t)buf[1];
 302         return num;
 303 #endif
 304 }
 305
 306
 307 static inline uint16_t
 308 read16le(const uint8_t *buf)
 309 {
 310 #if !defined(WORDS_BIGENDIAN) || defined(TUKLIB_FAST_UNALIGNED_ACCESS)
 311         uint16_t num = read16ne(buf);
 312         return conv16le(num);
 313 #else
 314         uint16_t num = ((uint16_t)buf[0]) | ((uint16_t)buf[1] << 8);
 315         return num;
 316 #endif
 317 }
 318
 319
 320 static inline uint32_t
 321 read32be(const uint8_t *buf)
 322 {
 323 #if defined(WORDS_BIGENDIAN) || defined(TUKLIB_FAST_UNALIGNED_ACCESS)
 324         uint32_t num = read32ne(buf);
 325         return conv32be(num);
 326 #else
 327         uint32_t num = (uint32_t)buf[0] << 24;
 328         num |= (uint32_t)buf[1] << 16;
 329         num |= (uint32_t)buf[2] << 8;
 330         num |= (uint32_t)buf[3];
 331         return num;
 332 #endif
 333 }
 334
 335
 336 static inline uint32_t
 337 read32le(const uint8_t *buf)
 338 {
 339 #if !defined(WORDS_BIGENDIAN) || defined(TUKLIB_FAST_UNALIGNED_ACCESS)
 340         uint32_t num = read32ne(buf);
 341         return conv32le(num);
 342 #else
 343         uint32_t num = (uint32_t)buf[0];
 344         num |= (uint32_t)buf[1] << 8;
 345         num |= (uint32_t)buf[2] << 16;
 346         num |= (uint32_t)buf[3] << 24;
 347         return num;
 348 #endif
 349 }
 350
 351
 352 static inline uint64_t
 353 read64be(const uint8_t *buf)
 354 {
 355 #if defined(WORDS_BIGENDIAN) || defined(TUKLIB_FAST_UNALIGNED_ACCESS)
 356         uint64_t num = read64ne(buf);
 357         return conv64be(num);
 358 #else
 359         uint64_t num = (uint64_t)buf[0] << 56;
 360         num |= (uint64_t)buf[1] << 48;
 361         num |= (uint64_t)buf[2] << 40;
 362         num |= (uint64_t)buf[3] << 32;
 363         num |= (uint64_t)buf[4] << 24;
 364         num |= (uint64_t)buf[5] << 16;
 365         num |= (uint64_t)buf[6] << 8;
 366         num |= (uint64_t)buf[7];
 367         return num;
 368 #endif
 369 }
 370
 371
 372 static inline uint64_t
 373 read64le(const uint8_t *buf)
 374 {
 375 #if !defined(WORDS_BIGENDIAN) || defined(TUKLIB_FAST_UNALIGNED_ACCESS)
 376         uint64_t num = read64ne(buf);
 377         return conv64le(num);
 378 #else
 379         uint64_t num = (uint64_t)buf[0];
 380         num |= (uint64_t)buf[1] << 8;
 381         num |= (uint64_t)buf[2] << 16;
 382         num |= (uint64_t)buf[3] << 24;
 383         num |= (uint64_t)buf[4] << 32;
 384         num |= (uint64_t)buf[5] << 40;
 385         num |= (uint64_t)buf[6] << 48;
 386         num |= (uint64_t)buf[7] << 56;
 387         return num;
 388 #endif
 389 }
 390
 391
 392 // NOTE: Possible byte swapping must be done in a macro to allow the compiler
 393 // to optimize byte swapping of constants when using glibc's or *BSD's
 394 // byte swapping macros. The actual write is done in an inline function
 395 // to make type checking of the buf pointer possible.
 396 #if defined(WORDS_BIGENDIAN) || defined(TUKLIB_FAST_UNALIGNED_ACCESS)
 397 #       define write16be(buf, num) write16ne(buf, conv16be(num))
 398 #       define write32be(buf, num) write32ne(buf, conv32be(num))
 399 #       define write64be(buf, num) write64ne(buf, conv64be(num))
 400 #endif
 401
 402 #if !defined(WORDS_BIGENDIAN) || defined(TUKLIB_FAST_UNALIGNED_ACCESS)
 403 #       define write16le(buf, num) write16ne(buf, conv16le(num))
 404 #       define write32le(buf, num) write32ne(buf, conv32le(num))
 405 #       define write64le(buf, num) write64ne(buf, conv64le(num))
 406 #endif
 407
 408
 409 #ifndef write16be
 410 static inline void
 411 write16be(uint8_t *buf, uint16_t num)
 412 {
 413         buf[0] = (uint8_t)(num >> 8);
 414         buf[1] = (uint8_t)num;
 415         return;
 416 }
 417 #endif
 418
 419
 420 #ifndef write16le
 421 static inline void
 422 write16le(uint8_t *buf, uint16_t num)
 423 {
 424         buf[0] = (uint8_t)num;
 425         buf[1] = (uint8_t)(num >> 8);
 426         return;
 427 }
 428 #endif
 429
 430
 431 #ifndef write32be
 432 static inline void
 433 write32be(uint8_t *buf, uint32_t num)
 434 {
 435         buf[0] = (uint8_t)(num >> 24);
 436         buf[1] = (uint8_t)(num >> 16);
 437         buf[2] = (uint8_t)(num >> 8);
 438         buf[3] = (uint8_t)num;
 439         return;
 440 }
 441 #endif
 442
 443
 444 #ifndef write32le
 445 static inline void
 446 write32le(uint8_t *buf, uint32_t num)
 447 {
 448         buf[0] = (uint8_t)num;
 449         buf[1] = (uint8_t)(num >> 8);
 450         buf[2] = (uint8_t)(num >> 16);
 451         buf[3] = (uint8_t)(num >> 24);
 452         return;
 453 }
 454 #endif
 455
 456
 457 //////////////////////////////
 458 // Aligned reads and writes //
 459 //////////////////////////////
 460
 461 // Separate functions for aligned reads and writes are provided since on
 462 // strict-align archs aligned access is much faster than unaligned access.
 463 //
 464 // Just like in the unaligned case, memcpy() is needed to avoid
 465 // strict aliasing violations. However, on archs that don't support
 466 // unaligned access the compiler cannot know that the pointers given
 467 // to memcpy() are aligned which results in slow code. As of C11 there is
 468 // no standard way to tell the compiler that we know that the address is
 469 // aligned but some compilers have language extensions to do that. With
 470 // such language extensions the memcpy() method gives excellent results.
 471 //
 472 // What to do on a strict-align system when no known language extentensions
 473 // are available? Falling back to byte-by-byte access would be safe but ruin
 474 // optimizations that have been made specifically with aligned access in mind.
 475 // As a compromise, aligned reads will fall back to non-compliant type punning
 476 // but aligned writes will be byte-by-byte, that is, fast reads are preferred
 477 // over fast writes. This obviously isn't great but hopefully it's a working
 478 // compromise for now.
 479 //
 480 // __builtin_assume_aligned is support by GCC >= 4.7 and clang >= 3.6.
 481 #ifdef HAVE___BUILTIN_ASSUME_ALIGNED
 482 #       define tuklib_memcpy_aligned(dest, src, size) \
 483                 memcpy(dest, __builtin_assume_aligned(src, size), size)
 484 #else
 485 #       define tuklib_memcpy_aligned(dest, src, size) \
 486                 memcpy(dest, src, size)
 487 #       ifndef TUKLIB_FAST_UNALIGNED_ACCESS
 488 #               define TUKLIB_USE_UNSAFE_ALIGNED_READS 1
 489 #       endif
 490 #endif
 491
 492
 493 static inline uint16_t
 494 aligned_read16ne(const uint8_t *buf)
 495 {
 496 #if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
 497                 || defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
 498         return *(const uint16_t *)buf;
 499 #else
 500         uint16_t num;
 501         tuklib_memcpy_aligned(&num, buf, sizeof(num));
 502         return num;
 503 #endif
 504 }
 505
 506
 507 static inline uint32_t
 508 aligned_read32ne(const uint8_t *buf)
 509 {
 510 #if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
 511                 || defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
 512         return *(const uint32_t *)buf;
 513 #else
 514         uint32_t num;
 515         tuklib_memcpy_aligned(&num, buf, sizeof(num));
 516         return num;
 517 #endif
 518 }
 519
 520
 521 static inline uint64_t
 522 aligned_read64ne(const uint8_t *buf)
 523 {
 524 #if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
 525                 || defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
 526         return *(const uint64_t *)buf;
 527 #else
 528         uint64_t num;
 529         tuklib_memcpy_aligned(&num, buf, sizeof(num));
 530         return num;
 531 #endif
 532 }
 533
 534
 535 static inline void
 536 aligned_write16ne(uint8_t *buf, uint16_t num)
 537 {
 538 #ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
 539         *(uint16_t *)buf = num;
 540 #else
 541         tuklib_memcpy_aligned(buf, &num, sizeof(num));
 542 #endif
 543         return;
 544 }
 545
 546
 547 static inline void
 548 aligned_write32ne(uint8_t *buf, uint32_t num)
 549 {
 550 #ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
 551         *(uint32_t *)buf = num;
 552 #else
 553         tuklib_memcpy_aligned(buf, &num, sizeof(num));
 554 #endif
 555         return;
 556 }
 557
 558
 559 static inline void
 560 aligned_write64ne(uint8_t *buf, uint64_t num)
 561 {
 562 #ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
 563         *(uint64_t *)buf = num;
 564 #else
 565         tuklib_memcpy_aligned(buf, &num, sizeof(num));
 566 #endif
 567         return;
 568 }
 569
 570
 571 static inline uint16_t
 572 aligned_read16be(const uint8_t *buf)
 573 {
 574         uint16_t num = aligned_read16ne(buf);
 575         return conv16be(num);
 576 }
 577
 578
 579 static inline uint16_t
 580 aligned_read16le(const uint8_t *buf)
 581 {
 582         uint16_t num = aligned_read16ne(buf);
 583         return conv16le(num);
 584 }
 585
 586
 587 static inline uint32_t
 588 aligned_read32be(const uint8_t *buf)
 589 {
 590         uint32_t num = aligned_read32ne(buf);
 591         return conv32be(num);
 592 }
 593
 594
 595 static inline uint32_t
 596 aligned_read32le(const uint8_t *buf)
 597 {
 598         uint32_t num = aligned_read32ne(buf);
 599         return conv32le(num);
 600 }
 601
 602
 603 static inline uint64_t
 604 aligned_read64be(const uint8_t *buf)
 605 {
 606         uint64_t num = aligned_read64ne(buf);
 607         return conv64be(num);
 608 }
 609
 610
 611 static inline uint64_t
 612 aligned_read64le(const uint8_t *buf)
 613 {
 614         uint64_t num = aligned_read64ne(buf);
 615         return conv64le(num);
 616 }
 617
 618
 619 // These need to be macros like in the unaligned case.
 620 #define aligned_write16be(buf, num) aligned_write16ne((buf), conv16be(num))
 621 #define aligned_write16le(buf, num) aligned_write16ne((buf), conv16le(num))
 622 #define aligned_write32be(buf, num) aligned_write32ne((buf), conv32be(num))
 623 #define aligned_write32le(buf, num) aligned_write32ne((buf), conv32le(num))
 624 #define aligned_write64be(buf, num) aligned_write64ne((buf), conv64be(num))
 625 #define aligned_write64le(buf, num) aligned_write64ne((buf), conv64le(num))
 626
 627
 628 ////////////////////
 629 // Bit operations //
 630 ////////////////////
 631
 632 static inline uint32_t
 633 bsr32(uint32_t n)
 634 {
 635         // Check for ICC first, since it tends to define __GNUC__ too.
 636 #if defined(__INTEL_COMPILER)
 637         return _bit_scan_reverse(n);
 638
 639 #elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX
 640         // GCC >= 3.4 has __builtin_clz(), which gives good results on
 641         // multiple architectures. On x86, __builtin_clz() ^ 31U becomes
 642         // either plain BSR (so the XOR gets optimized away) or LZCNT and
 643         // XOR (if -march indicates that SSE4a instructions are supported).
 644         return (uint32_t)__builtin_clz(n) ^ 31U;
 645
 646 #elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
 647         uint32_t i;
 648         __asm__("bsrl %1, %0" : "=r" (i) : "rm" (n));
 649         return i;
 650
 651 #elif defined(_MSC_VER)
 652         unsigned long i;
 653         _BitScanReverse(&i, n);
 654         return i;
 655
 656 #else
 657         uint32_t i = 31;
 658
 659         if ((n & 0xFFFF0000) == 0) {
 660                 n <<= 16;
 661                 i = 15;
 662         }
 663
 664         if ((n & 0xFF000000) == 0) {
 665                 n <<= 8;
 666                 i -= 8;
 667         }
 668
 669         if ((n & 0xF0000000) == 0) {
 670                 n <<= 4;
 671                 i -= 4;
 672         }
 673
 674         if ((n & 0xC0000000) == 0) {
 675                 n <<= 2;
 676                 i -= 2;
 677         }
 678
 679         if ((n & 0x80000000) == 0)
 680                 --i;
 681
 682         return i;
 683 #endif
 684 }
 685
 686
 687 static inline uint32_t
 688 clz32(uint32_t n)
 689 {
 690 #if defined(__INTEL_COMPILER)
 691         return _bit_scan_reverse(n) ^ 31U;
 692
 693 #elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX
 694         return (uint32_t)__builtin_clz(n);
 695
 696 #elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
 697         uint32_t i;
 698         __asm__("bsrl %1, %0\n\t"
 699                 "xorl $31, %0"
 700                 : "=r" (i) : "rm" (n));
 701         return i;
 702
 703 #elif defined(_MSC_VER)
 704         unsigned long i;
 705         _BitScanReverse(&i, n);
 706         return i ^ 31U;
 707
 708 #else
 709         uint32_t i = 0;
 710
 711         if ((n & 0xFFFF0000) == 0) {
 712                 n <<= 16;
 713                 i = 16;
 714         }
 715
 716         if ((n & 0xFF000000) == 0) {
 717                 n <<= 8;
 718                 i += 8;
 719         }
 720
 721         if ((n & 0xF0000000) == 0) {
 722                 n <<= 4;
 723                 i += 4;
 724         }
 725
 726         if ((n & 0xC0000000) == 0) {
 727                 n <<= 2;
 728                 i += 2;
 729         }
 730
 731         if ((n & 0x80000000) == 0)
 732                 ++i;
 733
 734         return i;
 735 #endif
 736 }
 737
 738
 739 static inline uint32_t
 740 ctz32(uint32_t n)
 741 {
 742 #if defined(__INTEL_COMPILER)
 743         return _bit_scan_forward(n);
 744
 745 #elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX >= UINT32_MAX
 746         return (uint32_t)__builtin_ctz(n);
 747
 748 #elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
 749         uint32_t i;
 750         __asm__("bsfl %1, %0" : "=r" (i) : "rm" (n));
 751         return i;
 752
 753 #elif defined(_MSC_VER)
 754         unsigned long i;
 755         _BitScanForward(&i, n);
 756         return i;
 757
 758 #else
 759         uint32_t i = 0;
 760
 761         if ((n & 0x0000FFFF) == 0) {
 762                 n >>= 16;
 763                 i = 16;
 764         }
 765
 766         if ((n & 0x000000FF) == 0) {
 767                 n >>= 8;
 768                 i += 8;
 769         }
 770
 771         if ((n & 0x0000000F) == 0) {
 772                 n >>= 4;
 773                 i += 4;
 774         }
 775
 776         if ((n & 0x00000003) == 0) {
 777                 n >>= 2;
 778                 i += 2;
 779         }
 780
 781         if ((n & 0x00000001) == 0)
 782                 ++i;
 783
 784         return i;
 785 #endif
 786 }
 787
 788 #define bsf32 ctz32
 789
 790 #endif