///////////////////////////////////////////////////////////////////////////////
//
/// \file tuklib_integer.h
/// \brief Various integer and bit operations
///
/// This file provides macros or functions to do some basic integer and bit
/// operations.
///
/// Native endian inline functions (XX = 16, 32, or 64):
/// - Unaligned native endian reads: readXXne(ptr)
/// - Unaligned native endian writes: writeXXne(ptr, num)
/// - Aligned native endian reads: aligned_readXXne(ptr)
/// - Aligned native endian writes: aligned_writeXXne(ptr, num)
///
/// Endianness-converting integer operations (these can be macros!)
/// (XX = 16, 32, or 64; Y = b or l):
/// - Byte swapping: bswapXX(num)
/// - Byte order conversions to/from native (byteswaps if Y isn't
///   the native endianness): convXXYe(num)
/// - Unaligned reads (16/32-bit only): readXXYe(ptr)
/// - Unaligned writes (16/32-bit only): writeXXYe(ptr, num)
/// - Aligned reads: aligned_readXXYe(ptr)
/// - Aligned writes: aligned_writeXXYe(ptr, num)
///
/// Since the above can be macros, the arguments should have no side effects
/// because they may be evaluated more than once.
///
/// Bit scan operations for non-zero 32-bit integers (inline functions):
/// - Bit scan reverse (find highest non-zero bit): bsr32(num)
/// - Count leading zeros: clz32(num)
/// - Count trailing zeros: ctz32(num)
/// - Bit scan forward (simply an alias for ctz32()): bsf32(num)
///
/// The above bit scan operations return a value in the range 0-31.
/// If num is zero, the result is undefined.
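///
/// Example (illustrative only, not part of the API above): reading a
/// 32-bit big endian field and locating its highest set bit:
///
///     uint8_t buf[4] = { 0x00, 0x12, 0x34, 0x56 };
///     uint32_t num = read32be(buf); // num == 0x00123456
///     uint32_t top = bsr32(num);    // top == 20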
//
// Authors: Lasse Collin
//          Joachim Henke
//
// This file has been put into the public domain.
// You can do whatever you want with this file.
//
///////////////////////////////////////////////////////////////////////////////

#ifndef TUKLIB_INTEGER_H
#define TUKLIB_INTEGER_H

#include "tuklib_common.h"
#include <string.h>

// Newer Intel C compilers require immintrin.h for _bit_scan_reverse()
// and such functions.
#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1500)
#	include <immintrin.h>
#endif


///////////////////
// Byte swapping //
///////////////////

#if defined(HAVE___BUILTIN_BSWAPXX)
	// GCC >= 4.8 and Clang
#	define bswap16(n) __builtin_bswap16(n)
#	define bswap32(n) __builtin_bswap32(n)
#	define bswap64(n) __builtin_bswap64(n)

#elif defined(HAVE_BYTESWAP_H)
	// glibc, uClibc, dietlibc
#	include <byteswap.h>
#	ifdef HAVE_BSWAP_16
#		define bswap16(num) bswap_16(num)
#	endif
#	ifdef HAVE_BSWAP_32
#		define bswap32(num) bswap_32(num)
#	endif
#	ifdef HAVE_BSWAP_64
#		define bswap64(num) bswap_64(num)
#	endif

#elif defined(HAVE_SYS_ENDIAN_H)
	// *BSDs and Darwin
#	include <sys/endian.h>

#elif defined(HAVE_SYS_BYTEORDER_H)
	// Solaris
#	include <sys/byteorder.h>
#	ifdef BSWAP_16
#		define bswap16(num) BSWAP_16(num)
#	endif
#	ifdef BSWAP_32
#		define bswap32(num) BSWAP_32(num)
#	endif
#	ifdef BSWAP_64
#		define bswap64(num) BSWAP_64(num)
#	endif
#	ifdef BE_16
#		define conv16be(num) BE_16(num)
#	endif
#	ifdef BE_32
#		define conv32be(num) BE_32(num)
#	endif
#	ifdef BE_64
#		define conv64be(num) BE_64(num)
#	endif
#	ifdef LE_16
#		define conv16le(num) LE_16(num)
#	endif
#	ifdef LE_32
#		define conv32le(num) LE_32(num)
#	endif
#	ifdef LE_64
#		define conv64le(num) LE_64(num)
#	endif
#endif

#ifndef bswap16
#	define bswap16(n) (uint16_t)( \
		  (((n) & 0x00FFU) << 8) \
		| (((n) & 0xFF00U) >> 8) \
	)
#endif

#ifndef bswap32
#	define bswap32(n) (uint32_t)( \
		  (((n) & UINT32_C(0x000000FF)) << 24) \
		| (((n) & UINT32_C(0x0000FF00)) << 8) \
		| (((n) & UINT32_C(0x00FF0000)) >> 8) \
		| (((n) & UINT32_C(0xFF000000)) >> 24) \
	)
#endif

#ifndef bswap64
#	define bswap64(n) (uint64_t)( \
		  (((n) & UINT64_C(0x00000000000000FF)) << 56) \
		| (((n) & UINT64_C(0x000000000000FF00)) << 40) \
		| (((n) & UINT64_C(0x0000000000FF0000)) << 24) \
		| (((n) & UINT64_C(0x00000000FF000000)) << 8) \
		| (((n) & UINT64_C(0x000000FF00000000)) >> 8) \
		| (((n) & UINT64_C(0x0000FF0000000000)) >> 24) \
		| (((n) & UINT64_C(0x00FF000000000000)) >> 40) \
		| (((n) & UINT64_C(0xFF00000000000000)) >> 56) \
	)
#endif
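
// Illustrative examples: the fallback macros above produce the same
// results as the compiler builtins and the libc macros, e.g.
//
//     bswap16(UINT16_C(0x1122)) == 0x2211
//     bswap32(UINT32_C(0x11223344)) == 0x44332211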

// Define conversion macros using the basic byte swapping macros.
#ifdef WORDS_BIGENDIAN
#	ifndef conv16be
#		define conv16be(num) ((uint16_t)(num))
#	endif
#	ifndef conv32be
#		define conv32be(num) ((uint32_t)(num))
#	endif
#	ifndef conv64be
#		define conv64be(num) ((uint64_t)(num))
#	endif
#	ifndef conv16le
#		define conv16le(num) bswap16(num)
#	endif
#	ifndef conv32le
#		define conv32le(num) bswap32(num)
#	endif
#	ifndef conv64le
#		define conv64le(num) bswap64(num)
#	endif
#else
#	ifndef conv16be
#		define conv16be(num) bswap16(num)
#	endif
#	ifndef conv32be
#		define conv32be(num) bswap32(num)
#	endif
#	ifndef conv64be
#		define conv64be(num) bswap64(num)
#	endif
#	ifndef conv16le
#		define conv16le(num) ((uint16_t)(num))
#	endif
#	ifndef conv32le
#		define conv32le(num) ((uint32_t)(num))
#	endif
#	ifndef conv64le
#		define conv64le(num) ((uint64_t)(num))
#	endif
#endif
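
// Illustrative example (assuming a little endian host): conv32be()
// byte swaps while conv32le() is a plain cast.
//
//     uint32_t be = conv32be(UINT32_C(0x11223344)); // be == 0x44332211
//     uint32_t le = conv32le(UINT32_C(0x11223344)); // le == 0x11223344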


////////////////////////////////
// Unaligned reads and writes //
////////////////////////////////

// The traditional way of casting e.g. *(const uint16_t *)uint8_pointer
// is bad even if the uint8_pointer is properly aligned because such
// casts break strict aliasing rules and result in undefined behavior.
// With unaligned pointers it's even worse: compilers may emit vector
// instructions that require aligned pointers even if non-vector
// instructions work with unaligned pointers.
//
// Using memcpy() is the standards-compliant way to do unaligned access.
// Many modern compilers inline it so there is no function call overhead.
// For those compilers that don't handle the memcpy() method well, the
// old casting method (which violates strict aliasing) can be requested at
// build time. A third method, casting to a packed struct, would also be
// an option but isn't provided, to keep things simpler (it's already a
// mess). Hopefully this is flexible enough in practice.
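
// Illustrative sketch (assuming a little endian host): read16ne(),
// defined below, makes a misaligned access well defined, and on targets
// with fast unaligned access it typically compiles to a single load.
//
//     uint8_t buf[3] = { 0xAA, 0xBB, 0xCC };
//     uint16_t x = read16ne(buf + 1); // x == 0xCCBB despite misalignment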

static inline uint16_t
read16ne(const uint8_t *buf)
{
#if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \
		&& defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING)
	return *(const uint16_t *)buf;
#else
	uint16_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint32_t
read32ne(const uint8_t *buf)
{
#if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \
		&& defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING)
	return *(const uint32_t *)buf;
#else
	uint32_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint64_t
read64ne(const uint8_t *buf)
{
#if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \
		&& defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING)
	return *(const uint64_t *)buf;
#else
	uint64_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}


static inline void
write16ne(uint8_t *buf, uint16_t num)
{
#if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \
		&& defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING)
	*(uint16_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
write32ne(uint8_t *buf, uint32_t num)
{
#if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \
		&& defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING)
	*(uint32_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
write64ne(uint8_t *buf, uint64_t num)
{
#if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \
		&& defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING)
	*(uint64_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}


static inline uint16_t
read16be(const uint8_t *buf)
{
#if defined(WORDS_BIGENDIAN) || defined(TUKLIB_FAST_UNALIGNED_ACCESS)
	uint16_t num = read16ne(buf);
	return conv16be(num);
#else
	uint16_t num = ((uint16_t)buf[0] << 8) | (uint16_t)buf[1];
	return num;
#endif
}


static inline uint16_t
read16le(const uint8_t *buf)
{
#if !defined(WORDS_BIGENDIAN) || defined(TUKLIB_FAST_UNALIGNED_ACCESS)
	uint16_t num = read16ne(buf);
	return conv16le(num);
#else
	uint16_t num = ((uint16_t)buf[0]) | ((uint16_t)buf[1] << 8);
	return num;
#endif
}


static inline uint32_t
read32be(const uint8_t *buf)
{
#if defined(WORDS_BIGENDIAN) || defined(TUKLIB_FAST_UNALIGNED_ACCESS)
	uint32_t num = read32ne(buf);
	return conv32be(num);
#else
	uint32_t num = (uint32_t)buf[0] << 24;
	num |= (uint32_t)buf[1] << 16;
	num |= (uint32_t)buf[2] << 8;
	num |= (uint32_t)buf[3];
	return num;
#endif
}


static inline uint32_t
read32le(const uint8_t *buf)
{
#if !defined(WORDS_BIGENDIAN) || defined(TUKLIB_FAST_UNALIGNED_ACCESS)
	uint32_t num = read32ne(buf);
	return conv32le(num);
#else
	uint32_t num = (uint32_t)buf[0];
	num |= (uint32_t)buf[1] << 8;
	num |= (uint32_t)buf[2] << 16;
	num |= (uint32_t)buf[3] << 24;
	return num;
#endif
}
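
// Illustrative example: the same four bytes interpreted both ways.
//
//     static const uint8_t b[4] = { 0x01, 0x02, 0x03, 0x04 };
//     read32be(b) == UINT32_C(0x01020304)
//     read32le(b) == UINT32_C(0x04030201)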


// NOTE: Possible byte swapping must be done in a macro to allow the compiler
// to optimize byte swapping of constants when using glibc's or *BSD's
// byte swapping macros. The actual write is done in an inline function
// to make type checking of the buf pointer possible.
#if defined(WORDS_BIGENDIAN) || defined(TUKLIB_FAST_UNALIGNED_ACCESS)
#	define write16be(buf, num) write16ne(buf, conv16be(num))
#	define write32be(buf, num) write32ne(buf, conv32be(num))
#endif

#if !defined(WORDS_BIGENDIAN) || defined(TUKLIB_FAST_UNALIGNED_ACCESS)
#	define write16le(buf, num) write16ne(buf, conv16le(num))
#	define write32le(buf, num) write32ne(buf, conv32le(num))
#endif
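
// Illustrative example: with the macro forms above, a constant argument
// lets the compiler fold the byte swap at compile time.
//
//     uint8_t hdr[4];
//     write32be(hdr, UINT32_C(0x0000000C)); // stores 00 00 00 0C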


#ifndef write16be
static inline void
write16be(uint8_t *buf, uint16_t num)
{
	buf[0] = (uint8_t)(num >> 8);
	buf[1] = (uint8_t)num;
	return;
}
#endif


#ifndef write16le
static inline void
write16le(uint8_t *buf, uint16_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	return;
}
#endif


#ifndef write32be
static inline void
write32be(uint8_t *buf, uint32_t num)
{
	buf[0] = (uint8_t)(num >> 24);
	buf[1] = (uint8_t)(num >> 16);
	buf[2] = (uint8_t)(num >> 8);
	buf[3] = (uint8_t)num;
	return;
}
#endif


#ifndef write32le
static inline void
write32le(uint8_t *buf, uint32_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	buf[2] = (uint8_t)(num >> 16);
	buf[3] = (uint8_t)(num >> 24);
	return;
}
#endif


//////////////////////////////
// Aligned reads and writes //
//////////////////////////////

// Separate functions for aligned reads and writes are provided since on
// strict-align archs aligned access is much faster than unaligned access.
//
// Just like in the unaligned case, memcpy() is needed to avoid
// strict aliasing violations. However, on archs that don't support
// unaligned access the compiler cannot know that the pointers given
// to memcpy() are aligned, which results in slow code. As of C11 there is
// no standard way to tell the compiler that we know that the address is
// aligned, but some compilers have language extensions to do that. With
// such language extensions the memcpy() method gives excellent results.
//
// What to do on a strict-align system when no known language extensions
// are available? Falling back to byte-by-byte access would be safe but ruin
// optimizations that have been made specifically with aligned access in mind.
// As a compromise, aligned reads will fall back to non-compliant type punning
// but aligned writes will be byte-by-byte, that is, fast reads are preferred
// over fast writes. This obviously isn't great but hopefully it's a working
// compromise for now.
//
// __builtin_assume_aligned is supported by GCC >= 4.7 and Clang >= 3.6.
#ifdef HAVE___BUILTIN_ASSUME_ALIGNED
#	define tuklib_memcpy_aligned(dest, src, size) \
		memcpy(dest, __builtin_assume_aligned(src, size), size)
#else
#	define tuklib_memcpy_aligned(dest, src, size) \
		memcpy(dest, src, size)
#	ifndef TUKLIB_FAST_UNALIGNED_ACCESS
#		define TUKLIB_USE_UNSAFE_ALIGNED_READS 1
#	endif
#endif
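
// Illustrative example: callers must guarantee the alignment themselves.
//
//     uint32_t arr[2] = { 1, 2 };
//     uint32_t x = aligned_read32ne((const uint8_t *)arr); // OK: aligned
//
// Passing (const uint8_t *)arr + 1 instead would violate the contract
// and be undefined behavior on a strict-align target.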


static inline uint16_t
aligned_read16ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint16_t *)buf;
#else
	uint16_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint32_t
aligned_read32ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint32_t *)buf;
#else
	uint32_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint64_t
aligned_read64ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint64_t *)buf;
#else
	uint64_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


static inline void
aligned_write16ne(uint8_t *buf, uint16_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint16_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
aligned_write32ne(uint8_t *buf, uint32_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint32_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
aligned_write64ne(uint8_t *buf, uint64_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint64_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}


static inline uint16_t
aligned_read16be(const uint8_t *buf)
{
	uint16_t num = aligned_read16ne(buf);
	return conv16be(num);
}


static inline uint16_t
aligned_read16le(const uint8_t *buf)
{
	uint16_t num = aligned_read16ne(buf);
	return conv16le(num);
}


static inline uint32_t
aligned_read32be(const uint8_t *buf)
{
	uint32_t num = aligned_read32ne(buf);
	return conv32be(num);
}


static inline uint32_t
aligned_read32le(const uint8_t *buf)
{
	uint32_t num = aligned_read32ne(buf);
	return conv32le(num);
}


static inline uint64_t
aligned_read64be(const uint8_t *buf)
{
	uint64_t num = aligned_read64ne(buf);
	return conv64be(num);
}


static inline uint64_t
aligned_read64le(const uint8_t *buf)
{
	uint64_t num = aligned_read64ne(buf);
	return conv64le(num);
}


// These need to be macros like in the unaligned case.
#define aligned_write16be(buf, num) aligned_write16ne((buf), conv16be(num))
#define aligned_write16le(buf, num) aligned_write16ne((buf), conv16le(num))
#define aligned_write32be(buf, num) aligned_write32ne((buf), conv32be(num))
#define aligned_write32le(buf, num) aligned_write32ne((buf), conv32le(num))
#define aligned_write64be(buf, num) aligned_write64ne((buf), conv64be(num))
#define aligned_write64le(buf, num) aligned_write64ne((buf), conv64le(num))


////////////////////
// Bit operations //
////////////////////

static inline uint32_t
bsr32(uint32_t n)
{
	// Check for ICC first, since it tends to define __GNUC__ too.
#if defined(__INTEL_COMPILER)
	return _bit_scan_reverse(n);

#elif TUKLIB_GNUC_REQ(3, 4) && UINT_MAX == UINT32_MAX
	// GCC >= 3.4 has __builtin_clz(), which gives good results on
	// multiple architectures. On x86, __builtin_clz() ^ 31U becomes
	// either plain BSR (so the XOR gets optimized away) or LZCNT and
	// XOR (if -march indicates that SSE4a instructions are supported).
	return (uint32_t)__builtin_clz(n) ^ 31U;

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsrl %1, %0" : "=r" (i) : "rm" (n));
	return i;

#elif defined(_MSC_VER)
	unsigned long i;
	_BitScanReverse(&i, n);
	return i;

#else
	// Generic fallback: narrow down the position of the highest set
	// bit with a binary search, 16, 8, 4, 2, and 1 bits at a time.
	uint32_t i = 31;

	if ((n & 0xFFFF0000) == 0) {
		n <<= 16;
		i = 15;
	}

	if ((n & 0xFF000000) == 0) {
		n <<= 8;
		i -= 8;
	}

	if ((n & 0xF0000000) == 0) {
		n <<= 4;
		i -= 4;
	}

	if ((n & 0xC0000000) == 0) {
		n <<= 2;
		i -= 2;
	}

	if ((n & 0x80000000) == 0)
		--i;

	return i;
#endif
}


static inline uint32_t
clz32(uint32_t n)
{
#if defined(__INTEL_COMPILER)
	return _bit_scan_reverse(n) ^ 31U;

#elif TUKLIB_GNUC_REQ(3, 4) && UINT_MAX == UINT32_MAX
	return (uint32_t)__builtin_clz(n);

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsrl %1, %0\n\t"
		"xorl $31, %0"
		: "=r" (i) : "rm" (n));
	return i;

#elif defined(_MSC_VER)
	unsigned long i;
	_BitScanReverse(&i, n);
	return i ^ 31U;

#else
	// Generic fallback: count leading zeros with a binary search,
	// checking 16, 8, 4, 2, and 1 bits at a time.
	uint32_t i = 0;

	if ((n & 0xFFFF0000) == 0) {
		n <<= 16;
		i = 16;
	}

	if ((n & 0xFF000000) == 0) {
		n <<= 8;
		i += 8;
	}

	if ((n & 0xF0000000) == 0) {
		n <<= 4;
		i += 4;
	}

	if ((n & 0xC0000000) == 0) {
		n <<= 2;
		i += 2;
	}

	if ((n & 0x80000000) == 0)
		++i;

	return i;
#endif
}


static inline uint32_t
ctz32(uint32_t n)
{
#if defined(__INTEL_COMPILER)
	return _bit_scan_forward(n);

#elif TUKLIB_GNUC_REQ(3, 4) && UINT_MAX >= UINT32_MAX
	return (uint32_t)__builtin_ctz(n);

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsfl %1, %0" : "=r" (i) : "rm" (n));
	return i;

#elif defined(_MSC_VER)
	unsigned long i;
	_BitScanForward(&i, n);
	return i;

#else
	// Generic fallback: count trailing zeros with a binary search,
	// checking 16, 8, 4, 2, and 1 bits at a time.
	uint32_t i = 0;

	if ((n & 0x0000FFFF) == 0) {
		n >>= 16;
		i = 16;
	}

	if ((n & 0x000000FF) == 0) {
		n >>= 8;
		i += 8;
	}

	if ((n & 0x0000000F) == 0) {
		n >>= 4;
		i += 4;
	}

	if ((n & 0x00000003) == 0) {
		n >>= 2;
		i += 2;
	}

	if ((n & 0x00000001) == 0)
		++i;

	return i;
#endif
}

#define bsf32 ctz32
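
// Illustrative examples of the bit scan results (num must be non-zero):
//
//     bsr32(UINT32_C(0x00010000)) == 16
//     clz32(UINT32_C(0x00010000)) == 15   // bsr32(n) == 31 - clz32(n)
//     ctz32(UINT32_C(0x00010000)) == 16   // bsf32 is the same operation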

#endif