llvm/lib/Support/BLAKE3/README.md

   1 Implementation of BLAKE3, originating from https://github.com/BLAKE3-team/BLAKE3/tree/1.3.1/c
   2
   3 # Example
   4
   5 An example program that hashes bytes from standard input and prints the
   6 result:
   7
   8 Using the C++ API:
   9
  10 ```c++
  11 #include "llvm/Support/BLAKE3.h"
  12 #include <errno.h>
  13 #include <stdio.h>
  14 #include <stdlib.h>
  15 #include <string.h>
  16 #include <unistd.h>
  17
  18 int main() {
  19   // Initialize the hasher.
  20   llvm::BLAKE3 hasher;
  21
  22   // Read input bytes from stdin.
  23   char buf[65536];
  24   while (1) {
  25     ssize_t n = read(STDIN_FILENO, buf, sizeof(buf));
  26     if (n > 0) {
  27       hasher.update(llvm::StringRef(buf, n));
  28     } else if (n == 0) {
  29       break; // end of file
  30     } else {
  31       fprintf(stderr, "read failed: %s\n", strerror(errno));
  32       exit(1);
  33     }
  34   }
  35
  36   // Finalize the hash. Default output length is 32 bytes.
  37   auto output = hasher.final();
  38
  39   // Print the hash as hexadecimal.
  40   for (uint8_t byte : output) {
  41     printf("%02x", byte);
  42   }
  43   printf("\n");
  44   return 0;
  45 }
  46 ```
  47
  48 Using the C API:
  49
  50 ```c
  51 #include "llvm-c/blake3.h"
  52 #include <errno.h>
  53 #include <stdio.h>
  54 #include <stdlib.h>
  55 #include <string.h>
  56 #include <unistd.h>
  57
  58 int main() {
  59   // Initialize the hasher.
  60   llvm_blake3_hasher hasher;
  61   llvm_blake3_hasher_init(&hasher);
  62
  63   // Read input bytes from stdin.
  64   unsigned char buf[65536];
  65   while (1) {
  66     ssize_t n = read(STDIN_FILENO, buf, sizeof(buf));
  67     if (n > 0) {
  68       llvm_blake3_hasher_update(&hasher, buf, n);
  69     } else if (n == 0) {
  70       break; // end of file
  71     } else {
  72       fprintf(stderr, "read failed: %s\n", strerror(errno));
  73       exit(1);
  74     }
  75   }
  76
  77   // Finalize the hash. LLVM_BLAKE3_OUT_LEN is the default output length, 32 bytes.
  78   uint8_t output[LLVM_BLAKE3_OUT_LEN];
  79   llvm_blake3_hasher_finalize(&hasher, output, LLVM_BLAKE3_OUT_LEN);
  80
  81   // Print the hash as hexadecimal.
  82   for (size_t i = 0; i < LLVM_BLAKE3_OUT_LEN; i++) {
  83     printf("%02x", output[i]);
  84   }
  85   printf("\n");
  86   return 0;
  87 }
  88 ```
  89
  90 # API
  91
  92 ## The Class/Struct
  93
  94 ```c++
  95 class BLAKE3 {
  96   // API
  97 private:
  98   llvm_blake3_hasher Hasher;
  99 };
 100 ```
 101 ```c
 102 typedef struct {
 103   // private fields
 104 } llvm_blake3_hasher;
 105 ```
 106
 107 An incremental BLAKE3 hashing state, which can accept any number of
 108 updates. This implementation doesn't allocate any heap memory, but
 109 `sizeof(llvm_blake3_hasher)` itself is relatively large, currently 1912 bytes
 110 on x86-64. This size can be reduced by restricting the maximum input
 111 length, as described in Section 5.4 of [the BLAKE3
 112 spec](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf),
 113 but this implementation doesn't currently support that strategy.
 114
 115 ## Common API Functions
 116
 117 ```c++
 118 BLAKE3::BLAKE3();
 119
 120 void BLAKE3::init();
 121 ```
 122 ```c
 123 void llvm_blake3_hasher_init(
 124   llvm_blake3_hasher *self);
 125 ```
 126
 127 Initialize a `llvm_blake3_hasher` in the default hashing mode.
 128
 129 ---
 130
 131 ```c++
 132 void BLAKE3::update(ArrayRef<uint8_t> Data);
 133
 134 void BLAKE3::update(StringRef Str);
 135 ```
 136 ```c
 137 void llvm_blake3_hasher_update(
 138   llvm_blake3_hasher *self,
 139   const void *input,
 140   size_t input_len);
 141 ```
 142
 143 Add input to the hasher. This can be called any number of times.
 144
 145 ---
 146
 147 ```c++
 148 template <size_t NumBytes = LLVM_BLAKE3_OUT_LEN>
 149 using BLAKE3Result = std::array<uint8_t, NumBytes>;
 150
 151 template <size_t NumBytes = LLVM_BLAKE3_OUT_LEN>
 152 void BLAKE3::final(BLAKE3Result<NumBytes> &Result);
 153
 154 template <size_t NumBytes = LLVM_BLAKE3_OUT_LEN>
 155 BLAKE3Result<NumBytes> BLAKE3::final();
 156 ```
 157 ```c
 158 void llvm_blake3_hasher_finalize(
 159   const llvm_blake3_hasher *self,
 160   uint8_t *out,
 161   size_t out_len);
 162 ```
 163
 164 Finalize the hasher and return an output of any length, given in bytes.
 165 This doesn't modify the hasher itself, and it's possible to finalize
 166 again after adding more input. The constant `LLVM_BLAKE3_OUT_LEN` provides
 167 the default output length, 32 bytes, which is recommended for most
 168 callers.
 169
 170 Outputs shorter than the default length of 32 bytes (256 bits) provide
 171 less security. An N-bit BLAKE3 output is intended to provide N bits of
 172 first and second preimage resistance and N/2 bits of collision
 173 resistance, for any N up to 256. Longer outputs don't provide any
 174 additional security.
 175
 176 Shorter BLAKE3 outputs are prefixes of longer ones. Explicitly
 177 requesting a short output is equivalent to truncating the default-length
  178 output. (Note that this differs from BLAKE2, where the requested output length changes the hash.)
 179
 180 ## Less Common API Functions
 181
 182 ```c
 183 void llvm_blake3_hasher_init_keyed(
 184   llvm_blake3_hasher *self,
 185   const uint8_t key[LLVM_BLAKE3_KEY_LEN]);
 186 ```
 187
 188 Initialize a `llvm_blake3_hasher` in the keyed hashing mode. The key must be
 189 exactly 32 bytes.
 190
 191 ---
 192
 193 ```c
 194 void llvm_blake3_hasher_init_derive_key(
 195   llvm_blake3_hasher *self,
 196   const char *context);
 197 ```
 198
 199 Initialize a `llvm_blake3_hasher` in the key derivation mode. The context
 200 string is given as an initialization parameter, and afterwards input key
 201 material should be given with `llvm_blake3_hasher_update`. The context string
 202 is a null-terminated C string which should be **hardcoded, globally
 203 unique, and application-specific**. The context string should not
 204 include any dynamic input like salts, nonces, or identifiers read from a
 205 database at runtime. A good default format for the context string is
 206 `"[application] [commit timestamp] [purpose]"`, e.g., `"example.com
 207 2019-12-25 16:18:03 session tokens v1"`.
 208
 209 This function is intended for application code written in C. For
 210 language bindings, see `llvm_blake3_hasher_init_derive_key_raw` below.
 211
 212 ---
 213
 214 ```c
 215 void llvm_blake3_hasher_init_derive_key_raw(
 216   llvm_blake3_hasher *self,
 217   const void *context,
 218   size_t context_len);
 219 ```
 220
 221 As `llvm_blake3_hasher_init_derive_key` above, except that the context string
 222 is given as a pointer to an array of arbitrary bytes with a provided
 223 length. This is intended for writing language bindings, where C string
 224 conversion would add unnecessary overhead and new error cases. Unicode
 225 strings should be encoded as UTF-8.
 226
 227 Application code in C should prefer `llvm_blake3_hasher_init_derive_key`,
 228 which takes the context as a C string. If you need to use arbitrary
 229 bytes as a context string in application code, consider whether you're
 230 violating the requirement that context strings should be hardcoded.
 231
 232 ---
 233
 234 ```c
 235 void llvm_blake3_hasher_finalize_seek(
 236   const llvm_blake3_hasher *self,
 237   uint64_t seek,
 238   uint8_t *out,
 239   size_t out_len);
 240 ```
 241
 242 The same as `llvm_blake3_hasher_finalize`, but with an additional `seek`
 243 parameter for the starting byte position in the output stream. To
 244 efficiently stream a large output without allocating memory, call this
 245 function in a loop, incrementing `seek` by the output length each time.
 246
 247 ---
 248
 249 ```c
 250 void llvm_blake3_hasher_reset(
 251   llvm_blake3_hasher *self);
 252 ```
 253
 254 Reset the hasher to its initial state, prior to any calls to
 255 `llvm_blake3_hasher_update`. Currently this is no different from calling
 256 `llvm_blake3_hasher_init` or similar again. However, if this implementation gains
 257 multithreading support in the future, and if `llvm_blake3_hasher` holds (optional)
 258 threading resources, this function will reuse those resources.
 259
 260
 261 # Building
 262
 263 This implementation is just C and assembly files.
 264
 265 ## x86
 266
 267 Dynamic dispatch is enabled by default on x86. The implementation will
 268 query the CPU at runtime to detect SIMD support, and it will use the
 269 widest instruction set available. By default, `blake3_dispatch.c`
 270 expects to be linked with code for five different instruction sets:
 271 portable C, SSE2, SSE4.1, AVX2, and AVX-512.
 272
 273 For each of the x86 SIMD instruction sets, four versions are available:
 274 three flavors of assembly (Unix, Windows MSVC, and Windows GNU) and one
 275 version using C intrinsics. The assembly versions are generally
 276 preferred. They perform better, they perform more consistently across
 277 different compilers, and they build more quickly. On the other hand, the
 278 assembly versions are x86\_64-only, and you need to select the right
 279 flavor for your target platform.
 280
 281 ## ARM NEON
 282
 283 The NEON implementation is enabled by default on AArch64, but not on
 284 other ARM targets, since not all of them support it. To enable it, set
 285 `BLAKE3_USE_NEON=1`.
 286
  287 To explicitly disable using NEON instructions on AArch64, set
 288 `BLAKE3_USE_NEON=0`.
 289
 290 ## Other Platforms
 291
 292 The portable implementation should work on most other architectures.
 293
 294 # Multithreading
 295
 296 The implementation doesn't currently support multithreading.