2 * QEMU TX packets abstractions
4 * Copyright (c) 2012 Ravello Systems LTD (http://ravellosystems.com)
6 * Developed by Daynix Computing LTD (http://www.daynix.com)
9 * Dmitry Fleytman <dmitry@daynix.com>
10 * Tamir Shomer <tamirs@daynix.com>
11 * Yan Vugenfirer <yan@daynix.com>
13 * This work is licensed under the terms of the GNU GPL, version 2 or later.
14 * See the COPYING file in the top-level directory.
18 #include "qemu/osdep.h"
19 #include "net_tx_pkt.h"
21 #include "net/checksum.h"
24 #include "hw/pci/pci.h"
27 NET_TX_PKT_VHDR_FRAG
= 0,
28 NET_TX_PKT_L2HDR_FRAG
,
29 NET_TX_PKT_L3HDR_FRAG
,
30 NET_TX_PKT_PL_START_FRAG
33 /* TX packet private context */
37 struct virtio_net_hdr virt_hdr
;
42 uint32_t max_raw_frags
;
46 uint8_t l2_hdr
[ETH_MAX_L2_HDR_LEN
];
47 uint8_t l3_hdr
[ETH_MAX_IP_DGRAM_LEN
];
51 uint32_t payload_frags
;
52 uint32_t max_payload_frags
;
55 eth_pkt_types_e packet_type
;
61 void net_tx_pkt_init(struct NetTxPkt
**pkt
, PCIDevice
*pci_dev
,
62 uint32_t max_frags
, bool has_virt_hdr
)
64 struct NetTxPkt
*p
= g_malloc0(sizeof *p
);
68 p
->vec
= g_new(struct iovec
, max_frags
+ NET_TX_PKT_PL_START_FRAG
);
70 p
->raw
= g_new(struct iovec
, max_frags
);
72 p
->max_payload_frags
= max_frags
;
73 p
->max_raw_frags
= max_frags
;
74 p
->has_virt_hdr
= has_virt_hdr
;
75 p
->vec
[NET_TX_PKT_VHDR_FRAG
].iov_base
= &p
->virt_hdr
;
76 p
->vec
[NET_TX_PKT_VHDR_FRAG
].iov_len
=
77 p
->has_virt_hdr
? sizeof p
->virt_hdr
: 0;
78 p
->vec
[NET_TX_PKT_L2HDR_FRAG
].iov_base
= &p
->l2_hdr
;
79 p
->vec
[NET_TX_PKT_L3HDR_FRAG
].iov_base
= &p
->l3_hdr
;
84 void net_tx_pkt_uninit(struct NetTxPkt
*pkt
)
93 void net_tx_pkt_update_ip_hdr_checksum(struct NetTxPkt
*pkt
)
97 struct ip_header
*ip_hdr
;
98 ip_hdr
= pkt
->vec
[NET_TX_PKT_L3HDR_FRAG
].iov_base
;
100 ip_hdr
->ip_len
= cpu_to_be16(pkt
->payload_len
+
101 pkt
->vec
[NET_TX_PKT_L3HDR_FRAG
].iov_len
);
104 csum
= net_raw_checksum((uint8_t *)ip_hdr
,
105 pkt
->vec
[NET_TX_PKT_L3HDR_FRAG
].iov_len
);
106 ip_hdr
->ip_sum
= cpu_to_be16(csum
);
109 void net_tx_pkt_update_ip_checksums(struct NetTxPkt
*pkt
)
114 uint8_t gso_type
= pkt
->virt_hdr
.gso_type
& ~VIRTIO_NET_HDR_GSO_ECN
;
115 void *ip_hdr
= pkt
->vec
[NET_TX_PKT_L3HDR_FRAG
].iov_base
;
117 if (pkt
->payload_len
+ pkt
->vec
[NET_TX_PKT_L3HDR_FRAG
].iov_len
>
118 ETH_MAX_IP_DGRAM_LEN
) {
122 if (gso_type
== VIRTIO_NET_HDR_GSO_TCPV4
||
123 gso_type
== VIRTIO_NET_HDR_GSO_UDP
) {
124 /* Calculate IP header checksum */
125 net_tx_pkt_update_ip_hdr_checksum(pkt
);
127 /* Calculate IP pseudo header checksum */
128 cntr
= eth_calc_ip4_pseudo_hdr_csum(ip_hdr
, pkt
->payload_len
, &cso
);
129 csum
= cpu_to_be16(~net_checksum_finish(cntr
));
130 } else if (gso_type
== VIRTIO_NET_HDR_GSO_TCPV6
) {
131 /* Calculate IP pseudo header checksum */
132 cntr
= eth_calc_ip6_pseudo_hdr_csum(ip_hdr
, pkt
->payload_len
,
134 csum
= cpu_to_be16(~net_checksum_finish(cntr
));
139 iov_from_buf(&pkt
->vec
[NET_TX_PKT_PL_START_FRAG
], pkt
->payload_frags
,
140 pkt
->virt_hdr
.csum_offset
, &csum
, sizeof(csum
));
143 static void net_tx_pkt_calculate_hdr_len(struct NetTxPkt
*pkt
)
145 pkt
->hdr_len
= pkt
->vec
[NET_TX_PKT_L2HDR_FRAG
].iov_len
+
146 pkt
->vec
[NET_TX_PKT_L3HDR_FRAG
].iov_len
;
149 static bool net_tx_pkt_parse_headers(struct NetTxPkt
*pkt
)
151 struct iovec
*l2_hdr
, *l3_hdr
;
153 size_t full_ip6hdr_len
;
158 l2_hdr
= &pkt
->vec
[NET_TX_PKT_L2HDR_FRAG
];
159 l3_hdr
= &pkt
->vec
[NET_TX_PKT_L3HDR_FRAG
];
161 bytes_read
= iov_to_buf(pkt
->raw
, pkt
->raw_frags
, 0, l2_hdr
->iov_base
,
163 if (bytes_read
< sizeof(struct eth_header
)) {
168 l2_hdr
->iov_len
= sizeof(struct eth_header
);
169 switch (be16_to_cpu(PKT_GET_ETH_HDR(l2_hdr
->iov_base
)->h_proto
)) {
171 l2_hdr
->iov_len
+= sizeof(struct vlan_header
);
174 l2_hdr
->iov_len
+= 2 * sizeof(struct vlan_header
);
178 if (bytes_read
< l2_hdr
->iov_len
) {
181 pkt
->packet_type
= ETH_PKT_UCAST
;
184 l2_hdr
->iov_len
= ETH_MAX_L2_HDR_LEN
;
185 l2_hdr
->iov_len
= eth_get_l2_hdr_length(l2_hdr
->iov_base
);
186 pkt
->packet_type
= get_eth_packet_type(l2_hdr
->iov_base
);
189 l3_proto
= eth_get_l3_proto(l2_hdr
, 1, l2_hdr
->iov_len
);
193 bytes_read
= iov_to_buf(pkt
->raw
, pkt
->raw_frags
, l2_hdr
->iov_len
,
194 l3_hdr
->iov_base
, sizeof(struct ip_header
));
196 if (bytes_read
< sizeof(struct ip_header
)) {
201 l3_hdr
->iov_len
= IP_HDR_GET_LEN(l3_hdr
->iov_base
);
203 if (l3_hdr
->iov_len
< sizeof(struct ip_header
)) {
208 pkt
->l4proto
= IP_HDR_GET_P(l3_hdr
->iov_base
);
210 if (IP_HDR_GET_LEN(l3_hdr
->iov_base
) != sizeof(struct ip_header
)) {
211 /* copy optional IPv4 header data if any*/
212 bytes_read
= iov_to_buf(pkt
->raw
, pkt
->raw_frags
,
213 l2_hdr
->iov_len
+ sizeof(struct ip_header
),
214 l3_hdr
->iov_base
+ sizeof(struct ip_header
),
215 l3_hdr
->iov_len
- sizeof(struct ip_header
));
216 if (bytes_read
< l3_hdr
->iov_len
- sizeof(struct ip_header
)) {
226 eth_ip6_hdr_info hdrinfo
;
228 if (!eth_parse_ipv6_hdr(pkt
->raw
, pkt
->raw_frags
, l2_hdr
->iov_len
,
234 pkt
->l4proto
= hdrinfo
.l4proto
;
235 full_ip6hdr_len
= hdrinfo
.full_hdr_len
;
237 if (full_ip6hdr_len
> ETH_MAX_IP_DGRAM_LEN
) {
242 bytes_read
= iov_to_buf(pkt
->raw
, pkt
->raw_frags
, l2_hdr
->iov_len
,
243 l3_hdr
->iov_base
, full_ip6hdr_len
);
245 if (bytes_read
< full_ip6hdr_len
) {
249 l3_hdr
->iov_len
= full_ip6hdr_len
;
258 net_tx_pkt_calculate_hdr_len(pkt
);
262 static void net_tx_pkt_rebuild_payload(struct NetTxPkt
*pkt
)
264 pkt
->payload_len
= iov_size(pkt
->raw
, pkt
->raw_frags
) - pkt
->hdr_len
;
265 pkt
->payload_frags
= iov_copy(&pkt
->vec
[NET_TX_PKT_PL_START_FRAG
],
266 pkt
->max_payload_frags
,
267 pkt
->raw
, pkt
->raw_frags
,
268 pkt
->hdr_len
, pkt
->payload_len
);
271 bool net_tx_pkt_parse(struct NetTxPkt
*pkt
)
273 if (net_tx_pkt_parse_headers(pkt
)) {
274 net_tx_pkt_rebuild_payload(pkt
);
281 struct virtio_net_hdr
*net_tx_pkt_get_vhdr(struct NetTxPkt
*pkt
)
284 return &pkt
->virt_hdr
;
287 static uint8_t net_tx_pkt_get_gso_type(struct NetTxPkt
*pkt
,
290 uint8_t rc
= VIRTIO_NET_HDR_GSO_NONE
;
293 l3_proto
= eth_get_l3_proto(&pkt
->vec
[NET_TX_PKT_L2HDR_FRAG
], 1,
294 pkt
->vec
[NET_TX_PKT_L2HDR_FRAG
].iov_len
);
300 rc
= eth_get_gso_type(l3_proto
, pkt
->vec
[NET_TX_PKT_L3HDR_FRAG
].iov_base
,
307 void net_tx_pkt_build_vheader(struct NetTxPkt
*pkt
, bool tso_enable
,
308 bool csum_enable
, uint32_t gso_size
)
310 struct tcp_hdr l4hdr
;
313 /* csum has to be enabled if tso is. */
314 assert(csum_enable
|| !tso_enable
);
316 pkt
->virt_hdr
.gso_type
= net_tx_pkt_get_gso_type(pkt
, tso_enable
);
318 switch (pkt
->virt_hdr
.gso_type
& ~VIRTIO_NET_HDR_GSO_ECN
) {
319 case VIRTIO_NET_HDR_GSO_NONE
:
320 pkt
->virt_hdr
.hdr_len
= 0;
321 pkt
->virt_hdr
.gso_size
= 0;
324 case VIRTIO_NET_HDR_GSO_UDP
:
325 pkt
->virt_hdr
.gso_size
= gso_size
;
326 pkt
->virt_hdr
.hdr_len
= pkt
->hdr_len
+ sizeof(struct udp_header
);
329 case VIRTIO_NET_HDR_GSO_TCPV4
:
330 case VIRTIO_NET_HDR_GSO_TCPV6
:
331 iov_to_buf(&pkt
->vec
[NET_TX_PKT_PL_START_FRAG
], pkt
->payload_frags
,
332 0, &l4hdr
, sizeof(l4hdr
));
333 pkt
->virt_hdr
.hdr_len
= pkt
->hdr_len
+ l4hdr
.th_off
* sizeof(uint32_t);
334 pkt
->virt_hdr
.gso_size
= gso_size
;
338 g_assert_not_reached();
342 switch (pkt
->l4proto
) {
344 pkt
->virt_hdr
.flags
= VIRTIO_NET_HDR_F_NEEDS_CSUM
;
345 pkt
->virt_hdr
.csum_start
= pkt
->hdr_len
;
346 pkt
->virt_hdr
.csum_offset
= offsetof(struct tcp_hdr
, th_sum
);
349 pkt
->virt_hdr
.flags
= VIRTIO_NET_HDR_F_NEEDS_CSUM
;
350 pkt
->virt_hdr
.csum_start
= pkt
->hdr_len
;
351 pkt
->virt_hdr
.csum_offset
= offsetof(struct udp_hdr
, uh_sum
);
359 void net_tx_pkt_setup_vlan_header_ex(struct NetTxPkt
*pkt
,
360 uint16_t vlan
, uint16_t vlan_ethtype
)
365 eth_setup_vlan_headers_ex(pkt
->vec
[NET_TX_PKT_L2HDR_FRAG
].iov_base
,
366 vlan
, vlan_ethtype
, &is_new
);
368 /* update l2hdrlen */
370 pkt
->hdr_len
+= sizeof(struct vlan_header
);
371 pkt
->vec
[NET_TX_PKT_L2HDR_FRAG
].iov_len
+=
372 sizeof(struct vlan_header
);
376 bool net_tx_pkt_add_raw_fragment(struct NetTxPkt
*pkt
, hwaddr pa
,
379 hwaddr mapped_len
= 0;
380 struct iovec
*ventry
;
383 if (pkt
->raw_frags
>= pkt
->max_raw_frags
) {
391 ventry
= &pkt
->raw
[pkt
->raw_frags
];
394 ventry
->iov_base
= pci_dma_map(pkt
->pci_dev
, pa
,
395 &mapped_len
, DMA_DIRECTION_TO_DEVICE
);
397 if ((ventry
->iov_base
!= NULL
) && (len
== mapped_len
)) {
398 ventry
->iov_len
= mapped_len
;
406 bool net_tx_pkt_has_fragments(struct NetTxPkt
*pkt
)
408 return pkt
->raw_frags
> 0;
411 eth_pkt_types_e
net_tx_pkt_get_packet_type(struct NetTxPkt
*pkt
)
415 return pkt
->packet_type
;
418 size_t net_tx_pkt_get_total_len(struct NetTxPkt
*pkt
)
422 return pkt
->hdr_len
+ pkt
->payload_len
;
425 void net_tx_pkt_dump(struct NetTxPkt
*pkt
)
427 #ifdef NET_TX_PKT_DEBUG
430 printf("TX PKT: hdr_len: %d, pkt_type: 0x%X, l2hdr_len: %lu, "
431 "l3hdr_len: %lu, payload_len: %u\n", pkt
->hdr_len
, pkt
->packet_type
,
432 pkt
->vec
[NET_TX_PKT_L2HDR_FRAG
].iov_len
,
433 pkt
->vec
[NET_TX_PKT_L3HDR_FRAG
].iov_len
, pkt
->payload_len
);
437 void net_tx_pkt_reset(struct NetTxPkt
*pkt
)
441 /* no assert, as reset can be called before tx_pkt_init */
446 memset(&pkt
->virt_hdr
, 0, sizeof(pkt
->virt_hdr
));
450 pkt
->payload_len
= 0;
451 pkt
->payload_frags
= 0;
453 if (pkt
->max_raw_frags
> 0) {
455 for (i
= 0; i
< pkt
->raw_frags
; i
++) {
456 assert(pkt
->raw
[i
].iov_base
);
457 pci_dma_unmap(pkt
->pci_dev
, pkt
->raw
[i
].iov_base
,
458 pkt
->raw
[i
].iov_len
, DMA_DIRECTION_TO_DEVICE
, 0);
467 static void net_tx_pkt_do_sw_csum(struct NetTxPkt
*pkt
)
469 struct iovec
*iov
= &pkt
->vec
[NET_TX_PKT_L2HDR_FRAG
];
473 /* num of iovec without vhdr */
474 uint32_t iov_len
= pkt
->payload_frags
+ NET_TX_PKT_PL_START_FRAG
- 1;
476 size_t csum_offset
= pkt
->virt_hdr
.csum_start
+ pkt
->virt_hdr
.csum_offset
;
477 uint16_t l3_proto
= eth_get_l3_proto(iov
, 1, iov
->iov_len
);
479 /* Put zero to checksum field */
480 iov_from_buf(iov
, iov_len
, csum_offset
, &csum
, sizeof csum
);
482 /* Calculate L4 TCP/UDP checksum */
483 csl
= pkt
->payload_len
;
487 /* add pseudo header to csum */
488 if (l3_proto
== ETH_P_IP
) {
489 csum_cntr
= eth_calc_ip4_pseudo_hdr_csum(
490 pkt
->vec
[NET_TX_PKT_L3HDR_FRAG
].iov_base
,
492 } else if (l3_proto
== ETH_P_IPV6
) {
493 csum_cntr
= eth_calc_ip6_pseudo_hdr_csum(
494 pkt
->vec
[NET_TX_PKT_L3HDR_FRAG
].iov_base
,
495 csl
, pkt
->l4proto
, &cso
);
500 net_checksum_add_iov(iov
, iov_len
, pkt
->virt_hdr
.csum_start
, csl
, cso
);
502 /* Put the checksum obtained into the packet */
503 csum
= cpu_to_be16(net_checksum_finish_nozero(csum_cntr
));
504 iov_from_buf(iov
, iov_len
, csum_offset
, &csum
, sizeof csum
);
508 NET_TX_PKT_FRAGMENT_L2_HDR_POS
= 0,
509 NET_TX_PKT_FRAGMENT_L3_HDR_POS
,
510 NET_TX_PKT_FRAGMENT_HEADER_NUM
513 #define NET_MAX_FRAG_SG_LIST (64)
515 static size_t net_tx_pkt_fetch_fragment(struct NetTxPkt
*pkt
,
516 int *src_idx
, size_t *src_offset
, struct iovec
*dst
, int *dst_idx
)
519 struct iovec
*src
= pkt
->vec
;
521 *dst_idx
= NET_TX_PKT_FRAGMENT_HEADER_NUM
;
523 while (fetched
< IP_FRAG_ALIGN_SIZE(pkt
->virt_hdr
.gso_size
)) {
525 /* no more place in fragment iov */
526 if (*dst_idx
== NET_MAX_FRAG_SG_LIST
) {
530 /* no more data in iovec */
531 if (*src_idx
== (pkt
->payload_frags
+ NET_TX_PKT_PL_START_FRAG
)) {
536 dst
[*dst_idx
].iov_base
= src
[*src_idx
].iov_base
+ *src_offset
;
537 dst
[*dst_idx
].iov_len
= MIN(src
[*src_idx
].iov_len
- *src_offset
,
538 IP_FRAG_ALIGN_SIZE(pkt
->virt_hdr
.gso_size
) - fetched
);
540 *src_offset
+= dst
[*dst_idx
].iov_len
;
541 fetched
+= dst
[*dst_idx
].iov_len
;
543 if (*src_offset
== src
[*src_idx
].iov_len
) {
554 static inline void net_tx_pkt_sendv(struct NetTxPkt
*pkt
,
555 NetClientState
*nc
, const struct iovec
*iov
, int iov_cnt
)
557 if (pkt
->is_loopback
) {
558 qemu_receive_packet_iov(nc
, iov
, iov_cnt
);
560 qemu_sendv_packet(nc
, iov
, iov_cnt
);
564 static bool net_tx_pkt_do_sw_fragmentation(struct NetTxPkt
*pkt
,
567 struct iovec fragment
[NET_MAX_FRAG_SG_LIST
];
568 size_t fragment_len
= 0;
569 bool more_frags
= false;
571 /* some pointers for shorter code */
572 void *l2_iov_base
, *l3_iov_base
;
573 size_t l2_iov_len
, l3_iov_len
;
574 int src_idx
= NET_TX_PKT_PL_START_FRAG
, dst_idx
;
575 size_t src_offset
= 0;
576 size_t fragment_offset
= 0;
578 l2_iov_base
= pkt
->vec
[NET_TX_PKT_L2HDR_FRAG
].iov_base
;
579 l2_iov_len
= pkt
->vec
[NET_TX_PKT_L2HDR_FRAG
].iov_len
;
580 l3_iov_base
= pkt
->vec
[NET_TX_PKT_L3HDR_FRAG
].iov_base
;
581 l3_iov_len
= pkt
->vec
[NET_TX_PKT_L3HDR_FRAG
].iov_len
;
584 fragment
[NET_TX_PKT_FRAGMENT_L2_HDR_POS
].iov_base
= l2_iov_base
;
585 fragment
[NET_TX_PKT_FRAGMENT_L2_HDR_POS
].iov_len
= l2_iov_len
;
586 fragment
[NET_TX_PKT_FRAGMENT_L3_HDR_POS
].iov_base
= l3_iov_base
;
587 fragment
[NET_TX_PKT_FRAGMENT_L3_HDR_POS
].iov_len
= l3_iov_len
;
590 /* Put as much data as possible and send */
592 fragment_len
= net_tx_pkt_fetch_fragment(pkt
, &src_idx
, &src_offset
,
595 more_frags
= (fragment_offset
+ fragment_len
< pkt
->payload_len
);
597 eth_setup_ip4_fragmentation(l2_iov_base
, l2_iov_len
, l3_iov_base
,
598 l3_iov_len
, fragment_len
, fragment_offset
, more_frags
);
600 eth_fix_ip4_checksum(l3_iov_base
, l3_iov_len
);
602 net_tx_pkt_sendv(pkt
, nc
, fragment
, dst_idx
);
604 fragment_offset
+= fragment_len
;
606 } while (fragment_len
&& more_frags
);
611 bool net_tx_pkt_send(struct NetTxPkt
*pkt
, NetClientState
*nc
)
615 if (!pkt
->has_virt_hdr
&&
616 pkt
->virt_hdr
.flags
& VIRTIO_NET_HDR_F_NEEDS_CSUM
) {
617 net_tx_pkt_do_sw_csum(pkt
);
621 * Since underlying infrastructure does not support IP datagrams longer
622 * than 64K we should drop such packets and don't even try to send
624 if (VIRTIO_NET_HDR_GSO_NONE
!= pkt
->virt_hdr
.gso_type
) {
625 if (pkt
->payload_len
>
626 ETH_MAX_IP_DGRAM_LEN
-
627 pkt
->vec
[NET_TX_PKT_L3HDR_FRAG
].iov_len
) {
632 if (pkt
->has_virt_hdr
||
633 pkt
->virt_hdr
.gso_type
== VIRTIO_NET_HDR_GSO_NONE
) {
634 net_tx_pkt_fix_ip6_payload_len(pkt
);
635 net_tx_pkt_sendv(pkt
, nc
, pkt
->vec
,
636 pkt
->payload_frags
+ NET_TX_PKT_PL_START_FRAG
);
640 return net_tx_pkt_do_sw_fragmentation(pkt
, nc
);
643 bool net_tx_pkt_send_loopback(struct NetTxPkt
*pkt
, NetClientState
*nc
)
647 pkt
->is_loopback
= true;
648 res
= net_tx_pkt_send(pkt
, nc
);
649 pkt
->is_loopback
= false;
654 void net_tx_pkt_fix_ip6_payload_len(struct NetTxPkt
*pkt
)
656 struct iovec
*l2
= &pkt
->vec
[NET_TX_PKT_L2HDR_FRAG
];
657 if (eth_get_l3_proto(l2
, 1, l2
->iov_len
) == ETH_P_IPV6
) {
658 struct ip6_header
*ip6
= (struct ip6_header
*) pkt
->l3_hdr
;
660 * TODO: if qemu would support >64K packets - add jumbo option check
661 * something like that:
662 * 'if (ip6->ip6_plen == 0 && !has_jumbo_option(ip6)) {'
664 if (ip6
->ip6_plen
== 0) {
665 if (pkt
->payload_len
<= ETH_MAX_IP_DGRAM_LEN
) {
666 ip6
->ip6_plen
= htons(pkt
->payload_len
);
669 * TODO: if qemu would support >64K packets
670 * add jumbo option for packets greater then 65,535 bytes