2 * Ones' complement checksum test & benchmark
4 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5 * See https://llvm.org/LICENSE.txt for license information.
6 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
19 #include "../include/networking.h"
24 #define Assert(exp) assert(exp)
26 #define Assert(exp) (void) (exp)
30 #define may_alias __attribute__((__may_alias__))
36 #define ALIGN(x, y) (((x) + (y) - 1) & ~((y) - 1))
38 /* Reference implementation - do not modify! */
40 checksum_simple(const void *ptr
, uint32_t nbytes
)
42 const uint16_t *may_alias hptr
= ptr
;
43 uint64_t sum
= 0;/* Need 64-bit accumulator when nbytes > 64K */
45 /* Sum all halfwords, assume misaligned accesses are handled in HW */
46 for (uint32_t nhalfs
= nbytes
>> 1; nhalfs
!= 0; nhalfs
--)
51 /* Add any trailing odd byte */
52 if ((nbytes
& 0x01) != 0)
54 sum
+= *(uint8_t *) hptr
;
57 /* Fold 64-bit sum to 32 bits */
58 sum
= (sum
& 0xffffffff) + (sum
>> 32);
59 sum
= (sum
& 0xffffffff) + (sum
>> 32);
60 Assert(sum
== (uint32_t) sum
);
62 /* Fold 32-bit sum to 16 bits */
63 sum
= (sum
& 0xffff) + (sum
>> 16);
64 sum
= (sum
& 0xffff) + (sum
>> 16);
65 Assert(sum
== (uint16_t) sum
);
67 return (uint16_t) sum
;
72 uint16_t (*cksum_fp
)(const void *, uint32_t);
76 { checksum_simple
, "simple"},
77 { __chksum
, "scalar"},
79 { __chksum_arm_simd
, "simd" },
81 { __chksum_aarch64_simd
, "simd" },
87 find_impl(const char *name
)
89 for (int i
= 0; implementations
[i
].name
!= NULL
; i
++)
91 if (strcmp(implementations
[i
].name
, name
) == 0)
99 static uint16_t (*CKSUM_FP
)(const void *, uint32_t);
100 static volatile uint16_t SINK
;
103 verify(const void *data
, uint32_t offset
, uint32_t size
)
106 uint16_t csum_expected
= checksum_simple(data
, size
);
107 uint16_t csum_actual
= CKSUM_FP(data
, size
);
108 if (csum_actual
!= csum_expected
)
110 fprintf(stderr
, "\nInvalid checksum for offset %u size %u: "
111 "actual %04x expected %04x (valid)",
112 offset
, size
, csum_actual
, csum_expected
);
118 /* Else some implementations only support sizes up to 2^16 */
128 clock_gettime(CLOCK_MONOTONIC
, &ts
);
129 return ts
.tv_sec
* (uint64_t) 1000000000 + ts
.tv_nsec
;
133 benchmark(const uint8_t *base
,
139 printf("%11u ", (unsigned int) blksize
); fflush(stdout
);
141 uint64_t start
= clock_get_ns();
142 for (uint32_t i
= 0; i
< numops
; i
++)
144 /* Read a random value from the pool */
145 uint32_t random
= ((uint32_t *) base
)[i
% (poolsize
/ 4)];
146 /* Generate a random starting address */
147 const void *data
= &base
[random
% (poolsize
- blksize
)];
148 SINK
= CKSUM_FP(data
, blksize
);
150 uint64_t end
= clock_get_ns();
152 #define MEGABYTE 1000000 /* Decimal megabyte (MB) */
153 uint64_t elapsed_ns
= end
- start
;
154 uint64_t elapsed_ms
= elapsed_ns
/ 1000000;
155 uint32_t blks_per_s
= (uint32_t) ((numops
/ elapsed_ms
) * 1000);
156 uint64_t accbytes
= (uint64_t) numops
* blksize
;
157 printf("%11ju ", (uintmax_t) ((accbytes
/ elapsed_ms
) * 1000) / MEGABYTE
);
158 unsigned int cyc_per_blk
= cpufreq
/ blks_per_s
;
159 printf("%11u ", cyc_per_blk
);
162 unsigned int cyc_per_byte
= 1000 * cyc_per_blk
/ blksize
;
164 cyc_per_byte
/ 1000, cyc_per_byte
% 1000);
169 int main(int argc
, char *argv
[])
173 uint32_t IMPL
= 0;/* Simple implementation */
174 uint64_t CPUFREQ
= 0;
175 uint32_t BLKSIZE
= 0;
176 uint32_t NUMOPS
= 1000000;
177 uint32_t POOLSIZE
= 512 * 1024;/* Typical ARM L2 cache size */
179 setvbuf(stdout
, NULL
, _IOLBF
, 160);
180 while ((c
= getopt(argc
, argv
, "b:df:i:n:p:")) != -1)
186 int blksize
= atoi(optarg
);
187 if (blksize
< 1 || blksize
> POOLSIZE
/ 2)
189 fprintf(stderr
, "Invalid block size %d\n", blksize
);
192 BLKSIZE
= (unsigned) blksize
;
200 int64_t cpufreq
= atoll(optarg
);
203 fprintf(stderr
, "Invalid CPU frequency %"PRId64
"\n",
212 int impl
= find_impl(optarg
);
215 fprintf(stderr
, "Invalid implementation %s\n", optarg
);
218 IMPL
= (unsigned) impl
;
223 int numops
= atoi(optarg
);
226 fprintf(stderr
, "Invalid number of operations %d\n", numops
);
229 NUMOPS
= (unsigned) numops
;
234 int poolsize
= atoi(optarg
);
237 fprintf(stderr
, "Invalid pool size %d\n", poolsize
);
240 char c
= optarg
[strlen(optarg
) - 1];
243 POOLSIZE
= (unsigned) poolsize
* 1024 * 1024;
247 POOLSIZE
= (unsigned) poolsize
* 1024;
251 POOLSIZE
= (unsigned) poolsize
;
257 fprintf(stderr
, "Usage: checksum <options>\n"
258 "-b <blksize> Block size\n"
259 "-d Dump first 96 bytes of data\n"
260 "-f <cpufreq> CPU frequency (Hz)\n"
261 "-i <impl> Implementation\n"
262 "-n <numops> Number of operations\n"
263 "-p <poolsize> Pool size (K or M suffix)\n"
265 printf("Implementations:");
266 for (int i
= 0; implementations
[i
].name
!= NULL
; i
++)
268 printf(" %s", implementations
[i
].name
);
279 CKSUM_FP
= implementations
[IMPL
].cksum_fp
;
280 POOLSIZE
= ALIGN(POOLSIZE
, CACHE_LINE
);
281 uint8_t *base
= mmap(0, POOLSIZE
, PROT_READ
|PROT_WRITE
,
282 MAP_PRIVATE
|MAP_ANONYMOUS
, -1, 0);
283 if (base
== MAP_FAILED
)
285 perror("aligned_alloc"), exit(EXIT_FAILURE
);
287 for (size_t i
= 0; i
< POOLSIZE
/ 4; i
++)
289 ((uint32_t *) base
)[i
] = rand();
292 printf("Implementation: %s\n", implementations
[IMPL
].name
);
293 printf("numops %u, poolsize ", NUMOPS
);
294 if (POOLSIZE
% (1024 * 1024) == 0)
296 printf("%uMiB", POOLSIZE
/ (1024 * 1024));
298 else if (POOLSIZE
% 1024 == 0)
300 printf("%uKiB", POOLSIZE
/ 1024);
304 printf("%uB", POOLSIZE
);
306 printf(", blocksize %u, CPU frequency %juMHz\n",
307 BLKSIZE
, (uintmax_t) (CPUFREQ
/ 1000000));
309 printf("Warning: assertions are enabled\n");
314 /* Print out first 96 bytes of data for human debugging */
315 for (int i
= 0; i
< 96; i
++)
319 printf(" %02x", base
[i
]);
325 /* Verify that chosen algorithm handles all combinations of offsets and sizes */
326 printf("Verifying..."); fflush(stdout
);
328 /* Check all (relevant) combinations of size and offset */
329 for (int size
= 0; size
<= 256; size
++)
331 for (int offset
= 0; offset
< 255; offset
++)
333 /* Check at start of mapped memory */
334 success
&= verify(&base
[offset
], offset
, size
);
335 /* Check at end of mapped memory */
336 uint8_t *p
= base
+ POOLSIZE
- (size
+ offset
);
337 success
&= verify(p
, (uintptr_t) p
% 64, size
);
340 /* Check increasingly larger sizes */
341 for (size_t size
= 1; size
< POOLSIZE
; size
*= 2)
343 success
&= verify(base
, 0, size
);
345 /* Check the full size, this can detect accumulator overflows */
346 success
&= verify(base
, 0, POOLSIZE
);
347 printf("%s\n", success
? "OK" : "failure");
349 /* Print throughput in decimal megabyte (1000000B) per second */
352 printf("%11s %11s %11s %11s\n",
353 "block size", "MB/s", "cycles/blk", "cycles/byte");
357 printf("%11s %11s %11s %11s\n",
358 "block size", "MB/s", "ns/blk", "ns/byte");
359 CPUFREQ
= 1000000000;
363 benchmark(base
, POOLSIZE
, BLKSIZE
, NUMOPS
, CPUFREQ
);
367 static const uint16_t sizes
[] =
368 { 20, 42, 102, 250, 612, 1500, 3674, 9000, 0 };
369 for (int i
= 0; sizes
[i
] != 0; i
++)
371 uint32_t numops
= NUMOPS
* 10000 / (40 + sizes
[i
]);
372 benchmark(base
, POOLSIZE
, sizes
[i
], numops
, CPUFREQ
);
376 if (munmap(base
, POOLSIZE
) != 0)
378 perror("munmap"), exit(EXIT_FAILURE
);
381 return success
? EXIT_SUCCESS
: EXIT_FAILURE
;