1 // SPDX-License-Identifier: GPL-2.0
4 * 1. Read packets and their rx_hash using PF_PACKET/TPACKET_V3
5 * 2. Compute the rx_hash in software based on the packet contents
8 * Optionally, either '-C $rx_irq_cpu_list' or '-r $rps_bitmap' may be given.
10 * If '-C $rx_irq_cpu_list' is given, also
12 * 4. Identify the cpu on which the packet arrived with PACKET_FANOUT_CPU
13 * 5. Compute the rxqueue that RSS would select based on this rx_hash
14 * 6. Using the $rx_irq_cpu_list map, identify the arriving cpu based on rxq irq
15 * 7. Compare the cpus from 4 and 6
17 * Else if '-r $rps_bitmap' is given, also
19 * 4. Identify the cpu on which the packet arrived with PACKET_FANOUT_CPU
20 * 5. Compute the cpu that RPS should select based on rx_hash and $rps_bitmap
21 * 6. Compare the cpus from 4 and 5
26 #include <arpa/inet.h>
31 #include <linux/filter.h>
32 #include <linux/if_ether.h>
33 #include <linux/if_packet.h>
36 #include <netinet/ip.h>
37 #include <netinet/ip6.h>
38 #include <netinet/tcp.h>
39 #include <netinet/udp.h>
48 #include <sys/socket.h>
50 #include <sys/sysinfo.h>
52 #include <sys/types.h>
55 #include "../kselftest.h"
57 #define TOEPLITZ_KEY_MIN_LEN 40
58 #define TOEPLITZ_KEY_MAX_LEN 60
60 #define TOEPLITZ_STR_LEN(K) (((K) * 3) - 1) /* hex encoded: AA:BB:CC:...:ZZ */
61 #define TOEPLITZ_STR_MIN_LEN TOEPLITZ_STR_LEN(TOEPLITZ_KEY_MIN_LEN)
62 #define TOEPLITZ_STR_MAX_LEN TOEPLITZ_STR_LEN(TOEPLITZ_KEY_MAX_LEN)
64 #define FOUR_TUPLE_MAX_LEN ((sizeof(struct in6_addr) * 2) + (sizeof(uint16_t) * 2))
66 #define RSS_MAX_CPUS (1 << 16) /* real constraint is PACKET_FANOUT_MAX */
68 #define RPS_MAX_CPUS 16UL /* must be a power of 2 */
70 /* configuration options (cmdline arguments) */
71 static uint16_t cfg_dport
= 8000;
72 static int cfg_family
= AF_INET6
;
73 static char *cfg_ifname
= "eth0";
74 static int cfg_num_queues
;
75 static int cfg_num_rps_cpus
;
77 static int cfg_type
= SOCK_STREAM
;
78 static int cfg_timeout_msec
= 1000;
79 static bool cfg_verbose
;
83 static int ring_block_nr
;
84 static int ring_block_sz
;
87 static int frames_received
;
88 static int frames_nohash
;
89 static int frames_error
;
91 #define log_verbose(args...) do { if (cfg_verbose) fprintf(stderr, args); } while (0)
101 static unsigned int rx_irq_cpus
[RSS_MAX_CPUS
]; /* map from rxq to cpu */
102 static int rps_silo_to_cpu
[RPS_MAX_CPUS
];
103 static unsigned char toeplitz_key
[TOEPLITZ_KEY_MAX_LEN
];
104 static struct ring_state rings
[RSS_MAX_CPUS
];
106 static inline uint32_t toeplitz(const unsigned char *four_tuple
,
107 const unsigned char *key
)
112 key32
= ntohl(*((uint32_t *)key
));
115 for (i
= 0; i
< FOUR_TUPLE_MAX_LEN
; i
++) {
116 for (bit
= 7; bit
>= 0; bit
--) {
117 if (four_tuple
[i
] & (1 << bit
))
121 key32
|= !!(key
[0] & (1 << bit
));
129 /* Compare computed cpu with arrival cpu from packet_fanout_cpu */
130 static void verify_rss(uint32_t rx_hash
, int cpu
)
132 int queue
= rx_hash
% cfg_num_queues
;
134 log_verbose(" rxq %d (cpu %d)", queue
, rx_irq_cpus
[queue
]);
135 if (rx_irq_cpus
[queue
] != cpu
) {
136 log_verbose(". error: rss cpu mismatch (%d)", cpu
);
141 static void verify_rps(uint64_t rx_hash
, int cpu
)
143 int silo
= (rx_hash
* cfg_num_rps_cpus
) >> 32;
145 log_verbose(" silo %d (cpu %d)", silo
, rps_silo_to_cpu
[silo
]);
146 if (rps_silo_to_cpu
[silo
] != cpu
) {
147 log_verbose(". error: rps cpu mismatch (%d)", cpu
);
152 static void log_rxhash(int cpu
, uint32_t rx_hash
,
153 const char *addrs
, int addr_len
)
155 char saddr
[INET6_ADDRSTRLEN
], daddr
[INET6_ADDRSTRLEN
];
158 if (!inet_ntop(cfg_family
, addrs
, saddr
, sizeof(saddr
)) ||
159 !inet_ntop(cfg_family
, addrs
+ addr_len
, daddr
, sizeof(daddr
)))
160 error(1, 0, "address parse error");
162 ports
= (void *)addrs
+ (addr_len
* 2);
163 log_verbose("cpu %d: rx_hash 0x%08x [saddr %s daddr %s sport %02hu dport %02hu]",
164 cpu
, rx_hash
, saddr
, daddr
,
165 ntohs(ports
[0]), ntohs(ports
[1]));
168 /* Compare computed rxhash with rxhash received from tpacket_v3 */
169 static void verify_rxhash(const char *pkt
, uint32_t rx_hash
, int cpu
)
171 unsigned char four_tuple
[FOUR_TUPLE_MAX_LEN
] = {0};
176 if (cfg_family
== AF_INET
) {
177 addr_len
= sizeof(struct in_addr
);
178 addrs
= pkt
+ offsetof(struct iphdr
, saddr
);
180 addr_len
= sizeof(struct in6_addr
);
181 addrs
= pkt
+ offsetof(struct ip6_hdr
, ip6_src
);
184 memcpy(four_tuple
, addrs
, (addr_len
* 2) + (sizeof(uint16_t) * 2));
185 rx_hash_sw
= toeplitz(four_tuple
, toeplitz_key
);
188 log_rxhash(cpu
, rx_hash
, addrs
, addr_len
);
190 if (rx_hash
!= rx_hash_sw
) {
191 log_verbose(" != expected 0x%x\n", rx_hash_sw
);
198 verify_rss(rx_hash
, cpu
);
199 else if (cfg_num_rps_cpus
)
200 verify_rps(rx_hash
, cpu
);
204 static char *recv_frame(const struct ring_state
*ring
, char *frame
)
206 struct tpacket3_hdr
*hdr
= (void *)frame
;
208 if (hdr
->hv1
.tp_rxhash
)
209 verify_rxhash(frame
+ hdr
->tp_net
, hdr
->hv1
.tp_rxhash
,
214 return frame
+ hdr
->tp_next_offset
;
217 /* A single TPACKET_V3 block can hold multiple frames */
218 static bool recv_block(struct ring_state
*ring
)
220 struct tpacket_block_desc
*block
;
224 block
= (void *)(ring
->mmap
+ ring
->idx
* ring_block_sz
);
225 if (!(block
->hdr
.bh1
.block_status
& TP_STATUS_USER
))
228 frame
= (char *)block
;
229 frame
+= block
->hdr
.bh1
.offset_to_first_pkt
;
231 for (i
= 0; i
< block
->hdr
.bh1
.num_pkts
; i
++) {
232 frame
= recv_frame(ring
, frame
);
236 block
->hdr
.bh1
.block_status
= TP_STATUS_KERNEL
;
237 ring
->idx
= (ring
->idx
+ 1) % ring_block_nr
;
242 /* simple test: sleep once unconditionally and then process all rings */
243 static void process_rings(void)
247 usleep(1000 * cfg_timeout_msec
);
249 for (i
= 0; i
< num_cpus
; i
++)
250 do {} while (recv_block(&rings
[i
]));
252 fprintf(stderr
, "count: pass=%u nohash=%u fail=%u\n",
253 frames_received
- frames_nohash
- frames_error
,
254 frames_nohash
, frames_error
);
257 static char *setup_ring(int fd
)
259 struct tpacket_req3 req3
= {0};
262 req3
.tp_retire_blk_tov
= cfg_timeout_msec
/ 8;
263 req3
.tp_feature_req_word
= TP_FT_REQ_FILL_RXHASH
;
265 req3
.tp_frame_size
= 2048;
266 req3
.tp_frame_nr
= 1 << 10;
267 req3
.tp_block_nr
= 16;
269 req3
.tp_block_size
= req3
.tp_frame_size
* req3
.tp_frame_nr
;
270 req3
.tp_block_size
/= req3
.tp_block_nr
;
272 if (setsockopt(fd
, SOL_PACKET
, PACKET_RX_RING
, &req3
, sizeof(req3
)))
273 error(1, errno
, "setsockopt PACKET_RX_RING");
275 ring_block_sz
= req3
.tp_block_size
;
276 ring_block_nr
= req3
.tp_block_nr
;
278 ring
= mmap(0, req3
.tp_block_size
* req3
.tp_block_nr
,
279 PROT_READ
| PROT_WRITE
,
280 MAP_SHARED
| MAP_LOCKED
| MAP_POPULATE
, fd
, 0);
281 if (ring
== MAP_FAILED
)
282 error(1, 0, "mmap failed");
287 static void __set_filter(int fd
, int off_proto
, uint8_t proto
, int off_dport
)
289 struct sock_filter filter
[] = {
290 BPF_STMT(BPF_LD
+ BPF_B
+ BPF_ABS
, SKF_AD_OFF
+ SKF_AD_PKTTYPE
),
291 BPF_JUMP(BPF_JMP
+ BPF_JEQ
+ BPF_K
, PACKET_HOST
, 0, 4),
292 BPF_STMT(BPF_LD
+ BPF_B
+ BPF_ABS
, off_proto
),
293 BPF_JUMP(BPF_JMP
+ BPF_JEQ
+ BPF_K
, proto
, 0, 2),
294 BPF_STMT(BPF_LD
+ BPF_H
+ BPF_ABS
, off_dport
),
295 BPF_JUMP(BPF_JMP
+ BPF_JEQ
+ BPF_K
, cfg_dport
, 1, 0),
296 BPF_STMT(BPF_RET
+ BPF_K
, 0),
297 BPF_STMT(BPF_RET
+ BPF_K
, 0xFFFF),
299 struct sock_fprog prog
= {};
301 prog
.filter
= filter
;
302 prog
.len
= ARRAY_SIZE(filter
);
303 if (setsockopt(fd
, SOL_SOCKET
, SO_ATTACH_FILTER
, &prog
, sizeof(prog
)))
304 error(1, errno
, "setsockopt filter");
307 /* filter on transport protocol and destination port */
308 static void set_filter(int fd
)
310 const int off_dport
= offsetof(struct tcphdr
, dest
); /* same for udp */
313 proto
= cfg_type
== SOCK_STREAM
? IPPROTO_TCP
: IPPROTO_UDP
;
314 if (cfg_family
== AF_INET
)
315 __set_filter(fd
, offsetof(struct iphdr
, protocol
), proto
,
316 sizeof(struct iphdr
) + off_dport
);
318 __set_filter(fd
, offsetof(struct ip6_hdr
, ip6_nxt
), proto
,
319 sizeof(struct ip6_hdr
) + off_dport
);
322 /* drop everything: used temporarily during setup */
323 static void set_filter_null(int fd
)
325 struct sock_filter filter
[] = {
326 BPF_STMT(BPF_RET
+ BPF_K
, 0),
328 struct sock_fprog prog
= {};
330 prog
.filter
= filter
;
331 prog
.len
= ARRAY_SIZE(filter
);
332 if (setsockopt(fd
, SOL_SOCKET
, SO_ATTACH_FILTER
, &prog
, sizeof(prog
)))
333 error(1, errno
, "setsockopt filter");
336 static int create_ring(char **ring
)
338 struct fanout_args args
= {
340 .type_flags
= PACKET_FANOUT_CPU
,
341 .max_num_members
= RSS_MAX_CPUS
343 struct sockaddr_ll ll
= { 0 };
346 fd
= socket(PF_PACKET
, SOCK_DGRAM
, 0);
348 error(1, errno
, "socket creation failed");
351 if (setsockopt(fd
, SOL_PACKET
, PACKET_VERSION
, &val
, sizeof(val
)))
352 error(1, errno
, "setsockopt PACKET_VERSION");
353 *ring
= setup_ring(fd
);
355 /* block packets until all rings are added to the fanout group:
356 * else packets can arrive during setup and get misclassified
360 ll
.sll_family
= AF_PACKET
;
361 ll
.sll_ifindex
= if_nametoindex(cfg_ifname
);
362 ll
.sll_protocol
= cfg_family
== AF_INET
? htons(ETH_P_IP
) :
364 if (bind(fd
, (void *)&ll
, sizeof(ll
)))
365 error(1, errno
, "bind");
367 /* must come after bind: verifies all programs in group match */
368 if (setsockopt(fd
, SOL_PACKET
, PACKET_FANOUT
, &args
, sizeof(args
))) {
369 /* on failure, retry using old API if that is sufficient:
370 * it has a hard limit of 256 sockets, so only try if
371 * (a) only testing rxhash, not RSS or (b) <= 256 cpus.
372 * in this API, the third argument is left implicit.
374 if (cfg_num_queues
|| num_cpus
> 256 ||
375 setsockopt(fd
, SOL_PACKET
, PACKET_FANOUT
,
376 &args
, sizeof(uint32_t)))
377 error(1, errno
, "setsockopt PACKET_FANOUT cpu");
383 /* setup inet(6) socket to blackhole the test traffic, if arg '-s' */
384 static int setup_sink(void)
388 fd
= socket(cfg_family
, cfg_type
, 0);
390 error(1, errno
, "socket %d.%d", cfg_family
, cfg_type
);
393 if (setsockopt(fd
, SOL_SOCKET
, SO_RCVBUFFORCE
, &val
, sizeof(val
)))
394 error(1, errno
, "setsockopt rcvbuf");
399 static void setup_rings(void)
403 for (i
= 0; i
< num_cpus
; i
++) {
405 rings
[i
].fd
= create_ring(&rings
[i
].mmap
);
408 /* accept packets once all rings in the fanout group are up */
409 for (i
= 0; i
< num_cpus
; i
++)
410 set_filter(rings
[i
].fd
);
413 static void cleanup_rings(void)
417 for (i
= 0; i
< num_cpus
; i
++) {
418 if (munmap(rings
[i
].mmap
, ring_block_nr
* ring_block_sz
))
419 error(1, errno
, "munmap");
420 if (close(rings
[i
].fd
))
421 error(1, errno
, "close");
425 static void parse_cpulist(const char *arg
)
428 rx_irq_cpus
[cfg_num_queues
++] = strtol(arg
, NULL
, 10);
430 arg
= strchr(arg
, ',');
437 static void show_cpulist(void)
441 for (i
= 0; i
< cfg_num_queues
; i
++)
442 fprintf(stderr
, "rxq %d: cpu %d\n", i
, rx_irq_cpus
[i
]);
445 static void show_silos(void)
449 for (i
= 0; i
< cfg_num_rps_cpus
; i
++)
450 fprintf(stderr
, "silo %d: cpu %d\n", i
, rps_silo_to_cpu
[i
]);
453 static void parse_toeplitz_key(const char *str
, int slen
, unsigned char *key
)
457 if (slen
< TOEPLITZ_STR_MIN_LEN
||
458 slen
> TOEPLITZ_STR_MAX_LEN
+ 1)
459 error(1, 0, "invalid toeplitz key");
461 for (i
= 0, off
= 0; off
< slen
; i
++, off
+= 3) {
462 ret
= sscanf(str
+ off
, "%hhx", &key
[i
]);
464 error(1, 0, "key parse error at %d off %d len %d",
469 static void parse_rps_bitmap(const char *arg
)
471 unsigned long bitmap
;
474 bitmap
= strtoul(arg
, NULL
, 0);
476 if (bitmap
& ~(RPS_MAX_CPUS
- 1))
477 error(1, 0, "rps bitmap 0x%lx out of bounds 0..%lu",
478 bitmap
, RPS_MAX_CPUS
- 1);
480 for (i
= 0; i
< RPS_MAX_CPUS
; i
++)
481 if (bitmap
& 1UL << i
)
482 rps_silo_to_cpu
[cfg_num_rps_cpus
++] = i
;
485 static void parse_opts(int argc
, char **argv
)
487 static struct option long_options
[] = {
488 {"dport", required_argument
, 0, 'd'},
489 {"cpus", required_argument
, 0, 'C'},
490 {"key", required_argument
, 0, 'k'},
491 {"iface", required_argument
, 0, 'i'},
492 {"ipv4", no_argument
, 0, '4'},
493 {"ipv6", no_argument
, 0, '6'},
494 {"sink", no_argument
, 0, 's'},
495 {"tcp", no_argument
, 0, 't'},
496 {"timeout", required_argument
, 0, 'T'},
497 {"udp", no_argument
, 0, 'u'},
498 {"verbose", no_argument
, 0, 'v'},
499 {"rps", required_argument
, 0, 'r'},
502 bool have_toeplitz
= false;
505 while ((c
= getopt_long(argc
, argv
, "46C:d:i:k:r:stT:uv", long_options
, &index
)) != -1) {
508 cfg_family
= AF_INET
;
511 cfg_family
= AF_INET6
;
514 parse_cpulist(optarg
);
517 cfg_dport
= strtol(optarg
, NULL
, 0);
523 parse_toeplitz_key(optarg
, strlen(optarg
),
525 have_toeplitz
= true;
528 parse_rps_bitmap(optarg
);
534 cfg_type
= SOCK_STREAM
;
537 cfg_timeout_msec
= strtol(optarg
, NULL
, 0);
540 cfg_type
= SOCK_DGRAM
;
547 error(1, 0, "unknown option %c", optopt
);
553 error(1, 0, "Must supply rss key ('-k')");
555 num_cpus
= get_nprocs();
556 if (num_cpus
> RSS_MAX_CPUS
)
557 error(1, 0, "increase RSS_MAX_CPUS");
559 if (cfg_num_queues
&& cfg_num_rps_cpus
)
561 "Can't supply both RSS cpus ('-C') and RPS map ('-r')");
568 int main(int argc
, char **argv
)
570 const int min_tests
= 10;
573 parse_opts(argc
, argv
);
576 fd_sink
= setup_sink();
582 if (cfg_sink
&& close(fd_sink
))
583 error(1, errno
, "close sink");
585 if (frames_received
- frames_nohash
< min_tests
)
586 error(1, 0, "too few frames for verification");