4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
33 #include <sys/sunddi.h>
34 #include <sys/sunndi.h>
35 #include <sys/modctl.h>
38 #include <sys/mac_impl.h> /* For mac_fix_cksum(). */
40 #include <sys/strsubr.h>
41 #include <sys/strsun.h>
42 #include <sys/types.h>
43 #include <sys/pattr.h>
44 #include <vm/seg_kmem.h>
45 #include <vm/hat_i86.h>
46 #include <xen/sys/xenbus_impl.h>
47 #include <xen/sys/xendev.h>
48 #include <sys/balloon_impl.h>
49 #include <sys/evtchn_impl.h>
50 #include <sys/gnttab.h>
51 #include <vm/vm_dep.h>
55 #include <inet/ip_impl.h>
58 * The terms "transmit" and "receive" are used in alignment with domU,
59 * which means that packets originating from the peer domU are "transmitted"
60 * to other parts of the system and packets are "received" from them.
64 * Should we allow guests to manipulate multicast group membership?
66 static boolean_t xnb_multicast_control
= B_TRUE
;
68 static boolean_t
xnb_connect_rings(dev_info_t
*);
69 static void xnb_disconnect_rings(dev_info_t
*);
70 static void xnb_oe_state_change(dev_info_t
*, ddi_eventcookie_t
,
72 static void xnb_hp_state_change(dev_info_t
*, ddi_eventcookie_t
,
75 static int xnb_txbuf_constructor(void *, void *, int);
76 static void xnb_txbuf_destructor(void *, void *);
77 static void xnb_tx_notify_peer(xnb_t
*, boolean_t
);
78 static void xnb_tx_mark_complete(xnb_t
*, RING_IDX
, int16_t);
80 mblk_t
*xnb_to_peer(xnb_t
*, mblk_t
*);
81 mblk_t
*xnb_copy_to_peer(xnb_t
*, mblk_t
*);
83 static void setup_gop(xnb_t
*, gnttab_copy_t
*, uchar_t
*,
84 size_t, size_t, size_t, grant_ref_t
);
85 #pragma inline(setup_gop)
86 static boolean_t
is_foreign(void *);
87 #pragma inline(is_foreign)
89 #define INVALID_GRANT_HANDLE ((grant_handle_t)-1)
90 #define INVALID_GRANT_REF ((grant_ref_t)-1)
92 static kmutex_t xnb_alloc_page_lock
;
95 * On a 32 bit PAE system physical and machine addresses are larger
96 * than 32 bits. ddi_btop() on such systems takes an unsigned long
97 * argument, and so addresses above 4G are truncated before ddi_btop()
98 * gets to see them. To avoid this, code the shift operation here.
100 #define xnb_btop(addr) ((addr) >> PAGESHIFT)
102 /* DMA attributes for transmit and receive data */
103 static ddi_dma_attr_t buf_dma_attr
= {
104 DMA_ATTR_V0
, /* version of this structure */
105 0, /* lowest usable address */
106 0xffffffffffffffffULL
, /* highest usable address */
107 0x7fffffff, /* maximum DMAable byte count */
108 MMU_PAGESIZE
, /* alignment in bytes */
109 0x7ff, /* bitmap of burst sizes */
110 1, /* minimum transfer */
111 0xffffffffU
, /* maximum transfer */
112 0xffffffffffffffffULL
, /* maximum segment length */
113 1, /* maximum number of segments */
115 0, /* flags (reserved) */
118 /* DMA access attributes for data: NOT to be byte swapped. */
119 static ddi_device_acc_attr_t data_accattr
= {
128 static const char * const aux_statistics
[] = {
132 "tx_notify_deferred",
134 "rx_notify_deferred",
143 "allocation_success",
144 "allocation_failure",
145 "small_allocation_success",
146 "small_allocation_failure",
147 "other_allocation_failure",
148 "rx_pageboundary_crossed",
153 "tx_unexpected_flags",
157 xnb_ks_aux_update(kstat_t
*ksp
, int flag
)
162 if (flag
!= KSTAT_READ
)
165 xnbp
= ksp
->ks_private
;
169 * Assignment order should match that of the names in
172 (knp
++)->value
.ui64
= xnbp
->xnb_stat_rx_cksum_deferred
;
173 (knp
++)->value
.ui64
= xnbp
->xnb_stat_tx_cksum_no_need
;
174 (knp
++)->value
.ui64
= xnbp
->xnb_stat_rx_rsp_notok
;
175 (knp
++)->value
.ui64
= xnbp
->xnb_stat_tx_notify_deferred
;
176 (knp
++)->value
.ui64
= xnbp
->xnb_stat_tx_notify_sent
;
177 (knp
++)->value
.ui64
= xnbp
->xnb_stat_rx_notify_deferred
;
178 (knp
++)->value
.ui64
= xnbp
->xnb_stat_rx_notify_sent
;
179 (knp
++)->value
.ui64
= xnbp
->xnb_stat_tx_too_early
;
180 (knp
++)->value
.ui64
= xnbp
->xnb_stat_rx_too_early
;
181 (knp
++)->value
.ui64
= xnbp
->xnb_stat_rx_allocb_failed
;
182 (knp
++)->value
.ui64
= xnbp
->xnb_stat_tx_allocb_failed
;
183 (knp
++)->value
.ui64
= xnbp
->xnb_stat_rx_foreign_page
;
184 (knp
++)->value
.ui64
= xnbp
->xnb_stat_mac_full
;
185 (knp
++)->value
.ui64
= xnbp
->xnb_stat_spurious_intr
;
186 (knp
++)->value
.ui64
= xnbp
->xnb_stat_allocation_success
;
187 (knp
++)->value
.ui64
= xnbp
->xnb_stat_allocation_failure
;
188 (knp
++)->value
.ui64
= xnbp
->xnb_stat_small_allocation_success
;
189 (knp
++)->value
.ui64
= xnbp
->xnb_stat_small_allocation_failure
;
190 (knp
++)->value
.ui64
= xnbp
->xnb_stat_other_allocation_failure
;
191 (knp
++)->value
.ui64
= xnbp
->xnb_stat_rx_pagebndry_crossed
;
192 (knp
++)->value
.ui64
= xnbp
->xnb_stat_rx_cpoparea_grown
;
193 (knp
++)->value
.ui64
= xnbp
->xnb_stat_csum_hardware
;
194 (knp
++)->value
.ui64
= xnbp
->xnb_stat_csum_software
;
195 (knp
++)->value
.ui64
= xnbp
->xnb_stat_tx_overflow_page
;
196 (knp
++)->value
.ui64
= xnbp
->xnb_stat_tx_unexpected_flags
;
202 xnb_ks_init(xnb_t
*xnbp
)
204 int nstat
= sizeof (aux_statistics
) /
205 sizeof (aux_statistics
[0]);
206 const char * const *cp
= aux_statistics
;
210 * Create and initialise kstats.
212 xnbp
->xnb_kstat_aux
= kstat_create(ddi_driver_name(xnbp
->xnb_devinfo
),
213 ddi_get_instance(xnbp
->xnb_devinfo
), "aux_statistics", "net",
214 KSTAT_TYPE_NAMED
, nstat
, 0);
215 if (xnbp
->xnb_kstat_aux
== NULL
)
218 xnbp
->xnb_kstat_aux
->ks_private
= xnbp
;
219 xnbp
->xnb_kstat_aux
->ks_update
= xnb_ks_aux_update
;
221 knp
= xnbp
->xnb_kstat_aux
->ks_data
;
223 kstat_named_init(knp
, *cp
, KSTAT_DATA_UINT64
);
230 kstat_install(xnbp
->xnb_kstat_aux
);
236 xnb_ks_free(xnb_t
*xnbp
)
238 kstat_delete(xnbp
->xnb_kstat_aux
);
242 * Calculate and insert the transport checksum for an arbitrary packet.
245 xnb_software_csum(xnb_t
*xnbp
, mblk_t
*mp
)
247 _NOTE(ARGUNUSED(xnbp
));
250 * XXPV dme: shouldn't rely on mac_fix_cksum(), not least
251 * because it doesn't cover all of the interesting cases :-(
253 mac_hcksum_set(mp
, 0, 0, 0, 0, HCK_FULLCKSUM
);
255 return (mac_fix_cksum(mp
));
259 xnb_process_cksum_flags(xnb_t
*xnbp
, mblk_t
*mp
, uint32_t capab
)
261 struct ether_header
*ehp
;
266 ASSERT(mp
->b_next
== NULL
);
269 * Check that the packet is contained in a single mblk. In
270 * the "from peer" path this is true today, but may change
271 * when scatter gather support is added. In the "to peer"
272 * path we cannot be sure, but in most cases it will be true
273 * (in the xnbo case the packet has come from a MAC device
274 * which is unlikely to split packets).
276 if (mp
->b_cont
!= NULL
)
280 * If the MAC has no hardware capability don't do any further
286 ASSERT(MBLKL(mp
) >= sizeof (struct ether_header
));
287 ehp
= (struct ether_header
*)mp
->b_rptr
;
289 if (ntohs(ehp
->ether_type
) == VLAN_TPID
) {
290 struct ether_vlan_header
*evhp
;
292 ASSERT(MBLKL(mp
) >= sizeof (struct ether_vlan_header
));
293 evhp
= (struct ether_vlan_header
*)mp
->b_rptr
;
294 sap
= ntohs(evhp
->ether_type
);
295 offset
= sizeof (struct ether_vlan_header
);
297 sap
= ntohs(ehp
->ether_type
);
298 offset
= sizeof (struct ether_header
);
302 * We only attempt to do IPv4 packets in hardware.
304 if (sap
!= ETHERTYPE_IP
)
308 * We know that this is an IPv4 packet.
310 ipha
= (ipha_t
*)(mp
->b_rptr
+ offset
);
312 switch (ipha
->ipha_protocol
) {
315 uint32_t start
, length
, stuff
, cksum
;
319 * This is a TCP/IPv4 or UDP/IPv4 packet, for which we
320 * can use full IPv4 and partial checksum offload.
322 if ((capab
& (HCKSUM_INET_FULL_V4
|HCKSUM_INET_PARTIAL
)) == 0)
325 start
= IP_SIMPLE_HDR_LENGTH
;
326 length
= ntohs(ipha
->ipha_length
);
327 if (ipha
->ipha_protocol
== IPPROTO_TCP
) {
328 stuff
= start
+ TCP_CHECKSUM_OFFSET
;
329 cksum
= IP_TCP_CSUM_COMP
;
331 stuff
= start
+ UDP_CHECKSUM_OFFSET
;
332 cksum
= IP_UDP_CSUM_COMP
;
334 stuffp
= (uint16_t *)(mp
->b_rptr
+ offset
+ stuff
);
336 if (capab
& HCKSUM_INET_FULL_V4
) {
338 * Some devices require that the checksum
339 * field of the packet is zero for full
344 mac_hcksum_set(mp
, 0, 0, 0, 0, HCK_FULLCKSUM
);
346 xnbp
->xnb_stat_csum_hardware
++;
351 if (capab
& HCKSUM_INET_PARTIAL
) {
356 * Older Solaris guests don't insert
357 * the pseudo-header checksum, so we
360 src
= ipha
->ipha_src
;
361 dst
= ipha
->ipha_dst
;
363 cksum
+= (dst
>> 16) + (dst
& 0xFFFF);
364 cksum
+= (src
>> 16) + (src
& 0xFFFF);
365 cksum
+= length
- IP_SIMPLE_HDR_LENGTH
;
367 cksum
= (cksum
>> 16) + (cksum
& 0xFFFF);
368 cksum
= (cksum
>> 16) + (cksum
& 0xFFFF);
370 ASSERT(cksum
<= 0xFFFF);
372 *stuffp
= (uint16_t)(cksum
? cksum
: ~cksum
);
375 mac_hcksum_set(mp
, start
, stuff
, length
, 0,
378 xnbp
->xnb_stat_csum_hardware
++;
394 * We are not able to use any offload so do the whole thing in
397 xnbp
->xnb_stat_csum_software
++;
399 return (xnb_software_csum(xnbp
, mp
));
403 xnb_attach(dev_info_t
*dip
, xnb_flavour_t
*flavour
, void *flavour_data
)
409 xnbp
= kmem_zalloc(sizeof (*xnbp
), KM_SLEEP
);
411 xnbp
->xnb_flavour
= flavour
;
412 xnbp
->xnb_flavour_data
= flavour_data
;
413 xnbp
->xnb_devinfo
= dip
;
414 xnbp
->xnb_evtchn
= INVALID_EVTCHN
;
415 xnbp
->xnb_irq
= B_FALSE
;
416 xnbp
->xnb_tx_ring_handle
= INVALID_GRANT_HANDLE
;
417 xnbp
->xnb_rx_ring_handle
= INVALID_GRANT_HANDLE
;
418 xnbp
->xnb_connected
= B_FALSE
;
419 xnbp
->xnb_hotplugged
= B_FALSE
;
420 xnbp
->xnb_detachable
= B_FALSE
;
421 xnbp
->xnb_peer
= xvdi_get_oeid(dip
);
422 xnbp
->xnb_be_status
= XNB_STATE_INIT
;
423 xnbp
->xnb_fe_status
= XNB_STATE_INIT
;
425 xnbp
->xnb_tx_buf_count
= 0;
427 xnbp
->xnb_rx_hv_copy
= B_FALSE
;
428 xnbp
->xnb_multicast_control
= B_FALSE
;
430 xnbp
->xnb_rx_va
= vmem_alloc(heap_arena
, PAGESIZE
, VM_SLEEP
);
431 ASSERT(xnbp
->xnb_rx_va
!= NULL
);
433 if (ddi_get_iblock_cookie(dip
, 0, &xnbp
->xnb_icookie
)
437 /* Allocated on demand, when/if we enter xnb_copy_to_peer(). */
438 xnbp
->xnb_rx_cpop
= NULL
;
439 xnbp
->xnb_rx_cpop_count
= 0;
441 mutex_init(&xnbp
->xnb_tx_lock
, NULL
, MUTEX_DRIVER
,
443 mutex_init(&xnbp
->xnb_rx_lock
, NULL
, MUTEX_DRIVER
,
445 mutex_init(&xnbp
->xnb_state_lock
, NULL
, MUTEX_DRIVER
,
448 /* Set driver private pointer now. */
449 ddi_set_driver_private(dip
, xnbp
);
451 (void) sprintf(cachename
, "xnb_tx_buf_cache_%d", ddi_get_instance(dip
));
452 xnbp
->xnb_tx_buf_cache
= kmem_cache_create(cachename
,
453 sizeof (xnb_txbuf_t
), 0,
454 xnb_txbuf_constructor
, xnb_txbuf_destructor
,
455 NULL
, xnbp
, NULL
, 0);
456 if (xnbp
->xnb_tx_buf_cache
== NULL
)
459 if (!xnb_ks_init(xnbp
))
463 * Receive notification of changes in the state of the
464 * driver in the guest domain.
466 if (xvdi_add_event_handler(dip
, XS_OE_STATE
, xnb_oe_state_change
,
467 NULL
) != DDI_SUCCESS
)
471 * Receive notification of hotplug events.
473 if (xvdi_add_event_handler(dip
, XS_HP_STATE
, xnb_hp_state_change
,
474 NULL
) != DDI_SUCCESS
)
477 xsname
= xvdi_get_xsname(dip
);
479 if (xenbus_printf(XBT_NULL
, xsname
,
480 "feature-multicast-control", "%d",
481 xnb_multicast_control
? 1 : 0) != 0)
484 if (xenbus_printf(XBT_NULL
, xsname
,
485 "feature-rx-copy", "%d", 1) != 0)
488 * Linux domUs seem to depend on "feature-rx-flip" being 0
489 * in addition to "feature-rx-copy" being 1. It seems strange
490 * to use four possible states to describe a binary decision,
491 * but we might as well play nice.
493 if (xenbus_printf(XBT_NULL
, xsname
,
494 "feature-rx-flip", "%d", 0) != 0)
497 (void) xvdi_switch_state(dip
, XBT_NULL
, XenbusStateInitWait
);
498 (void) xvdi_post_event(dip
, XEN_HP_ADD
);
500 return (DDI_SUCCESS
);
503 xvdi_remove_event_handler(dip
, NULL
);
509 kmem_cache_destroy(xnbp
->xnb_tx_buf_cache
);
512 mutex_destroy(&xnbp
->xnb_state_lock
);
513 mutex_destroy(&xnbp
->xnb_rx_lock
);
514 mutex_destroy(&xnbp
->xnb_tx_lock
);
517 vmem_free(heap_arena
, xnbp
->xnb_rx_va
, PAGESIZE
);
518 kmem_free(xnbp
, sizeof (*xnbp
));
519 return (DDI_FAILURE
);
523 xnb_detach(dev_info_t
*dip
)
525 xnb_t
*xnbp
= ddi_get_driver_private(dip
);
527 ASSERT(xnbp
!= NULL
);
528 ASSERT(!xnbp
->xnb_connected
);
529 ASSERT(xnbp
->xnb_tx_buf_count
== 0);
531 xnb_disconnect_rings(dip
);
533 xvdi_remove_event_handler(dip
, NULL
);
537 kmem_cache_destroy(xnbp
->xnb_tx_buf_cache
);
539 ddi_set_driver_private(dip
, NULL
);
541 mutex_destroy(&xnbp
->xnb_state_lock
);
542 mutex_destroy(&xnbp
->xnb_rx_lock
);
543 mutex_destroy(&xnbp
->xnb_tx_lock
);
545 if (xnbp
->xnb_rx_cpop_count
> 0)
546 kmem_free(xnbp
->xnb_rx_cpop
, sizeof (xnbp
->xnb_rx_cpop
[0])
547 * xnbp
->xnb_rx_cpop_count
);
549 ASSERT(xnbp
->xnb_rx_va
!= NULL
);
550 vmem_free(heap_arena
, xnbp
->xnb_rx_va
, PAGESIZE
);
552 kmem_free(xnbp
, sizeof (*xnbp
));
556 * Allocate a page from the hypervisor to be flipped to the peer.
558 * Try to get pages in batches to reduce the overhead of calls into
559 * the balloon driver.
562 xnb_alloc_page(xnb_t
*xnbp
)
564 #define WARNING_RATE_LIMIT 100
565 #define BATCH_SIZE 256
566 static mfn_t mfns
[BATCH_SIZE
]; /* common across all instances */
567 static int nth
= BATCH_SIZE
;
570 mutex_enter(&xnb_alloc_page_lock
);
571 if (nth
== BATCH_SIZE
) {
572 if (balloon_alloc_pages(BATCH_SIZE
, mfns
) != BATCH_SIZE
) {
573 xnbp
->xnb_stat_allocation_failure
++;
574 mutex_exit(&xnb_alloc_page_lock
);
577 * Try for a single page in low memory situations.
579 if (balloon_alloc_pages(1, &mfn
) != 1) {
580 if ((xnbp
->xnb_stat_small_allocation_failure
++
581 % WARNING_RATE_LIMIT
) == 0)
582 cmn_err(CE_WARN
, "xnb_alloc_page: "
583 "Cannot allocate memory to "
584 "transfer packets to peer.");
587 xnbp
->xnb_stat_small_allocation_success
++;
593 xnbp
->xnb_stat_allocation_success
++;
597 mutex_exit(&xnb_alloc_page_lock
);
603 #undef WARNING_RATE_LIMIT
607 * Free a page back to the hypervisor.
609 * This happens only in the error path, so batching is not worth the
613 xnb_free_page(xnb_t
*xnbp
, mfn_t mfn
)
615 _NOTE(ARGUNUSED(xnbp
));
619 pfn
= xen_assign_pfn(mfn
);
620 pfnzero(pfn
, 0, PAGESIZE
);
621 xen_release_pfn(pfn
);
623 if ((r
= balloon_free_pages(1, &mfn
, NULL
, NULL
)) != 1) {
624 cmn_err(CE_WARN
, "free_page: cannot decrease memory "
625 "reservation (%d): page kept but unusable (mfn = 0x%lx).",
631 * Similar to RING_HAS_UNCONSUMED_REQUESTS(&xnbp->rx_ring) but using
632 * local variables. Used in both xnb_to_peer() and xnb_copy_to_peer().
634 #define XNB_RING_HAS_UNCONSUMED_REQUESTS(_r) \
635 ((((_r)->sring->req_prod - loop) < \
636 (RING_SIZE(_r) - (loop - prod))) ? \
637 ((_r)->sring->req_prod - loop) : \
638 (RING_SIZE(_r) - (loop - prod)))
641 * Pass packets to the peer using page flipping.
644 xnb_to_peer(xnb_t
*xnbp
, mblk_t
*mp
)
646 mblk_t
*free
= mp
, *prev
= NULL
;
648 gnttab_transfer_t
*gop
;
650 RING_IDX loop
, prod
, end
;
653 * For each packet the sequence of operations is:
655 * 1. get a new page from the hypervisor.
656 * 2. get a request slot from the ring.
657 * 3. copy the data into the new page.
658 * 4. transfer the page to the peer.
659 * 5. update the request slot.
663 * In order to reduce the number of hypercalls, we prepare
664 * several packets for the peer and perform a single hypercall
668 mutex_enter(&xnbp
->xnb_rx_lock
);
671 * If we are not connected to the peer or have not yet
672 * finished hotplug it is too early to pass packets to the
675 if (!(xnbp
->xnb_connected
&& xnbp
->xnb_hotplugged
)) {
676 mutex_exit(&xnbp
->xnb_rx_lock
);
677 DTRACE_PROBE(flip_rx_too_early
);
678 xnbp
->xnb_stat_rx_too_early
++;
682 loop
= xnbp
->xnb_rx_ring
.req_cons
;
683 prod
= xnbp
->xnb_rx_ring
.rsp_prod_pvt
;
684 gop
= xnbp
->xnb_rx_top
;
686 while ((mp
!= NULL
) &&
687 XNB_RING_HAS_UNCONSUMED_REQUESTS(&xnbp
->xnb_rx_ring
)) {
691 netif_rx_request_t
*rxreq
;
692 netif_rx_response_t
*rxresp
;
695 uint16_t cksum_flags
;
698 if ((mfn
= xnb_alloc_page(xnbp
)) == 0) {
699 xnbp
->xnb_stat_rx_defer
++;
704 rxreq
= RING_GET_REQUEST(&xnbp
->xnb_rx_ring
, loop
);
707 if (!(rxreq
->id
< NET_RX_RING_SIZE
))
708 cmn_err(CE_PANIC
, "xnb_to_peer: "
709 "id %d out of range in request 0x%p",
710 rxreq
->id
, (void *)rxreq
);
711 #endif /* XNB_DEBUG */
713 /* Assign a pfn and map the new page at the allocated va. */
714 pfn
= xen_assign_pfn(mfn
);
715 hat_devload(kas
.a_hat
, xnbp
->xnb_rx_va
, PAGESIZE
,
716 pfn
, PROT_READ
| PROT_WRITE
, HAT_LOAD
);
720 valoop
= xnbp
->xnb_rx_va
;
721 for (ml
= mp
; ml
!= NULL
; ml
= ml
->b_cont
) {
722 size_t chunk
= ml
->b_wptr
- ml
->b_rptr
;
724 bcopy(ml
->b_rptr
, valoop
, chunk
);
729 ASSERT(len
< PAGESIZE
);
731 /* Release the pfn. */
732 hat_unload(kas
.a_hat
, xnbp
->xnb_rx_va
, PAGESIZE
,
734 xen_release_pfn(pfn
);
738 gop
->domid
= xnbp
->xnb_peer
;
739 gop
->ref
= rxreq
->gref
;
742 rxresp
= RING_GET_RESPONSE(&xnbp
->xnb_rx_ring
, prod
);
746 cksum_flags
= xnbp
->xnb_flavour
->xf_cksum_to_peer(xnbp
, mp
);
747 if (cksum_flags
!= 0)
748 xnbp
->xnb_stat_rx_cksum_deferred
++;
749 rxresp
->flags
|= cksum_flags
;
751 rxresp
->id
= RING_GET_REQUEST(&xnbp
->xnb_rx_ring
, prod
)->id
;
752 rxresp
->status
= len
;
762 * Did we actually do anything?
764 if (loop
== xnbp
->xnb_rx_ring
.req_cons
) {
765 mutex_exit(&xnbp
->xnb_rx_lock
);
772 * Unlink the end of the 'done' list from the remainder.
774 ASSERT(prev
!= NULL
);
777 if (HYPERVISOR_grant_table_op(GNTTABOP_transfer
, xnbp
->xnb_rx_top
,
778 loop
- xnbp
->xnb_rx_ring
.req_cons
) != 0) {
779 cmn_err(CE_WARN
, "xnb_to_peer: transfer operation failed");
782 loop
= xnbp
->xnb_rx_ring
.req_cons
;
783 prod
= xnbp
->xnb_rx_ring
.rsp_prod_pvt
;
784 gop
= xnbp
->xnb_rx_top
;
787 int16_t status
= NETIF_RSP_OKAY
;
789 if (gop
->status
!= 0) {
790 status
= NETIF_RSP_ERROR
;
793 * If the status is anything other than
794 * GNTST_bad_page then we don't own the page
795 * any more, so don't try to give it back.
797 if (gop
->status
!= GNTST_bad_page
)
800 /* The page is no longer ours. */
806 * Give back the page, as we won't be using
809 xnb_free_page(xnbp
, gop
->mfn
);
812 * We gave away a page, update our accounting
815 balloon_drv_subtracted(1);
818 if (status
!= NETIF_RSP_OKAY
) {
819 RING_GET_RESPONSE(&xnbp
->xnb_rx_ring
, prod
)->status
=
822 xnbp
->xnb_stat_ipackets
++;
823 xnbp
->xnb_stat_rbytes
+= len
;
831 xnbp
->xnb_rx_ring
.req_cons
= loop
;
832 xnbp
->xnb_rx_ring
.rsp_prod_pvt
= prod
;
835 /* LINTED: constant in conditional context */
836 RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp
->xnb_rx_ring
, notify
);
838 ec_notify_via_evtchn(xnbp
->xnb_evtchn
);
839 xnbp
->xnb_stat_rx_notify_sent
++;
841 xnbp
->xnb_stat_rx_notify_deferred
++;
845 xnbp
->xnb_stat_rx_defer
++;
847 mutex_exit(&xnbp
->xnb_rx_lock
);
849 /* Free mblk_t's that we consumed. */
855 /* Helper functions for xnb_copy_to_peer(). */
858 * Grow the array of copy operation descriptors.
861 grow_cpop_area(xnb_t
*xnbp
)
866 ASSERT(MUTEX_HELD(&xnbp
->xnb_rx_lock
));
868 count
= xnbp
->xnb_rx_cpop_count
+ CPOP_DEFCNT
;
870 if ((new = kmem_alloc(sizeof (new[0]) * count
, KM_NOSLEEP
)) == NULL
) {
871 xnbp
->xnb_stat_other_allocation_failure
++;
875 bcopy(xnbp
->xnb_rx_cpop
, new,
876 sizeof (xnbp
->xnb_rx_cpop
[0]) * xnbp
->xnb_rx_cpop_count
);
878 kmem_free(xnbp
->xnb_rx_cpop
,
879 sizeof (xnbp
->xnb_rx_cpop
[0]) * xnbp
->xnb_rx_cpop_count
);
881 xnbp
->xnb_rx_cpop
= new;
882 xnbp
->xnb_rx_cpop_count
= count
;
884 xnbp
->xnb_stat_rx_cpoparea_grown
++;
890 * Check whether an address is on a page that's foreign to this domain.
893 is_foreign(void *addr
)
895 pfn_t pfn
= hat_getpfnum(kas
.a_hat
, addr
);
897 return ((pfn
& PFN_IS_FOREIGN_MFN
) == PFN_IS_FOREIGN_MFN
);
901 * Insert a newly allocated mblk into a chain, replacing the old one.
904 replace_msg(mblk_t
*mp
, size_t len
, mblk_t
*mp_prev
, mblk_t
*ml_prev
)
906 uint32_t start
, stuff
, end
, value
, flags
;
910 if (new_mp
== NULL
) {
911 cmn_err(CE_PANIC
, "replace_msg: cannot alloc new message"
912 "for %p, len %lu", (void *) mp
, len
);
915 mac_hcksum_get(mp
, &start
, &stuff
, &end
, &value
, &flags
);
916 mac_hcksum_set(new_mp
, start
, stuff
, end
, value
, flags
);
918 new_mp
->b_next
= mp
->b_next
;
919 new_mp
->b_prev
= mp
->b_prev
;
920 new_mp
->b_cont
= mp
->b_cont
;
922 /* Make sure we only overwrite pointers to the mblk being replaced. */
923 if (mp_prev
!= NULL
&& mp_prev
->b_next
== mp
)
924 mp_prev
->b_next
= new_mp
;
926 if (ml_prev
!= NULL
&& ml_prev
->b_cont
== mp
)
927 ml_prev
->b_cont
= new_mp
;
929 mp
->b_next
= mp
->b_prev
= mp
->b_cont
= NULL
;
936 * Set all the fields in a gnttab_copy_t.
939 setup_gop(xnb_t
*xnbp
, gnttab_copy_t
*gp
, uchar_t
*rptr
,
940 size_t s_off
, size_t d_off
, size_t len
, grant_ref_t d_ref
)
942 ASSERT(xnbp
!= NULL
&& gp
!= NULL
);
944 gp
->source
.offset
= s_off
;
945 gp
->source
.u
.gmfn
= pfn_to_mfn(hat_getpfnum(kas
.a_hat
, (caddr_t
)rptr
));
946 gp
->source
.domid
= DOMID_SELF
;
948 gp
->len
= (uint16_t)len
;
949 gp
->flags
= GNTCOPY_dest_gref
;
952 gp
->dest
.u
.ref
= d_ref
;
953 gp
->dest
.offset
= d_off
;
954 gp
->dest
.domid
= xnbp
->xnb_peer
;
958 * Pass packets to the peer using hypervisor copy operations.
961 xnb_copy_to_peer(xnb_t
*xnbp
, mblk_t
*mp
)
963 mblk_t
*free
= mp
, *mp_prev
= NULL
, *saved_mp
= mp
;
964 mblk_t
*ml
, *ml_prev
;
970 * If the peer does not pre-post buffers for received packets,
971 * use page flipping to pass packets to it.
973 if (!xnbp
->xnb_rx_hv_copy
)
974 return (xnb_to_peer(xnbp
, mp
));
977 * For each packet the sequence of operations is:
979 * 1. get a request slot from the ring.
980 * 2. set up data for hypercall (see NOTE below)
981 * 3. have the hypervisor copy the data
982 * 4. update the request slot.
986 * In order to reduce the number of hypercalls, we prepare
987 * several mblks (mp->b_cont != NULL) for the peer and
988 * perform a single hypercall to transfer them. We also have
989 * to set up a separate copy operation for every page.
991 * If we have more than one packet (mp->b_next != NULL), we do
992 * this whole dance repeatedly.
995 mutex_enter(&xnbp
->xnb_rx_lock
);
997 if (!(xnbp
->xnb_connected
&& xnbp
->xnb_hotplugged
)) {
998 mutex_exit(&xnbp
->xnb_rx_lock
);
999 DTRACE_PROBE(copy_rx_too_early
);
1000 xnbp
->xnb_stat_rx_too_early
++;
1004 loop
= xnbp
->xnb_rx_ring
.req_cons
;
1005 prod
= xnbp
->xnb_rx_ring
.rsp_prod_pvt
;
1007 while ((mp
!= NULL
) &&
1008 XNB_RING_HAS_UNCONSUMED_REQUESTS(&xnbp
->xnb_rx_ring
)) {
1009 netif_rx_request_t
*rxreq
;
1010 size_t d_offset
, len
;
1012 gnttab_copy_t
*gop_cp
;
1013 netif_rx_response_t
*rxresp
;
1014 uint16_t cksum_flags
;
1015 int16_t status
= NETIF_RSP_OKAY
;
1018 rxreq
= RING_GET_REQUEST(&xnbp
->xnb_rx_ring
, loop
);
1021 if (!(rxreq
->id
< NET_RX_RING_SIZE
))
1022 cmn_err(CE_PANIC
, "xnb_copy_to_peer: "
1023 "id %d out of range in request 0x%p",
1024 rxreq
->id
, (void *)rxreq
);
1025 #endif /* XNB_DEBUG */
1032 gop_cp
= xnbp
->xnb_rx_cpop
;
1035 * We walk the b_cont pointers and set up a
1036 * gnttab_copy_t for each sub-page chunk in each data
1040 for (ml
= mp
, ml_prev
= NULL
; ml
!= NULL
; ml
= ml
->b_cont
) {
1041 size_t chunk
= ml
->b_wptr
- ml
->b_rptr
;
1042 uchar_t
*r_tmp
, *rpt_align
;
1046 * The hypervisor will not allow us to
1047 * reference a foreign page (e.g. one
1048 * belonging to another domain) by mfn in the
1049 * copy operation. If the data in this mblk is
1050 * on such a page we must copy the data into a
1051 * local page before initiating the hypervisor
1054 if (is_foreign(ml
->b_rptr
) || is_foreign(ml
->b_wptr
)) {
1055 mblk_t
*ml_new
= replace_msg(ml
, chunk
,
1058 /* We can still use old ml, but not *ml! */
1065 xnbp
->xnb_stat_rx_foreign_page
++;
1068 rpt_align
= (uchar_t
*)ALIGN2PAGE(ml
->b_rptr
);
1069 r_offset
= (uint16_t)(ml
->b_rptr
- rpt_align
);
1072 if (d_offset
+ chunk
> PAGESIZE
)
1073 cmn_err(CE_PANIC
, "xnb_copy_to_peer: mp %p "
1074 "(svd: %p), ml %p,rpt_alg. %p, d_offset "
1075 "(%lu) + chunk (%lu) > PAGESIZE %d!",
1076 (void *)mp
, (void *)saved_mp
, (void *)ml
,
1078 d_offset
, chunk
, (int)PAGESIZE
);
1083 if (item_count
== xnbp
->xnb_rx_cpop_count
) {
1084 if (!grow_cpop_area(xnbp
))
1086 gop_cp
= &xnbp
->xnb_rx_cpop
[item_count
];
1089 * If our mblk crosses a page boundary, we need
1090 * to do a separate copy for each page.
1092 if (r_offset
+ chunk
> PAGESIZE
) {
1093 part_len
= PAGESIZE
- r_offset
;
1095 DTRACE_PROBE3(mblk_page_crossed
,
1096 (mblk_t
*), ml
, int, chunk
, int,
1099 xnbp
->xnb_stat_rx_pagebndry_crossed
++;
1104 setup_gop(xnbp
, gop_cp
, r_tmp
, r_offset
,
1105 d_offset
, part_len
, rxreq
->gref
);
1110 d_offset
+= part_len
;
1113 * The 2nd, 3rd ... last copies will always
1114 * start at r_tmp, therefore r_offset is 0.
1122 DTRACE_PROBE4(mblk_loop_end
, (mblk_t
*), ml
, int,
1123 chunk
, int, len
, int, item_count
);
1126 if (HYPERVISOR_grant_table_op(GNTTABOP_copy
, xnbp
->xnb_rx_cpop
,
1128 cmn_err(CE_WARN
, "xnb_copy_to_peer: copy op. failed");
1129 DTRACE_PROBE(HV_granttableopfailed
);
1133 rxresp
= RING_GET_RESPONSE(&xnbp
->xnb_rx_ring
, prod
);
1138 DTRACE_PROBE4(got_RX_rsp
, int, (int)rxresp
->id
, int,
1139 (int)rxresp
->offset
, int, (int)rxresp
->flags
, int,
1140 (int)rxresp
->status
);
1142 cksum_flags
= xnbp
->xnb_flavour
->xf_cksum_to_peer(xnbp
, mp
);
1143 if (cksum_flags
!= 0)
1144 xnbp
->xnb_stat_rx_cksum_deferred
++;
1145 rxresp
->flags
|= cksum_flags
;
1147 rxresp
->id
= RING_GET_REQUEST(&xnbp
->xnb_rx_ring
, prod
)->id
;
1148 rxresp
->status
= len
;
1150 DTRACE_PROBE4(RX_rsp_set
, int, (int)rxresp
->id
, int,
1151 (int)rxresp
->offset
, int, (int)rxresp
->flags
, int,
1152 (int)rxresp
->status
);
1154 for (i
= 0; i
< item_count
; i
++) {
1155 if (xnbp
->xnb_rx_cpop
[i
].status
!= 0) {
1156 DTRACE_PROBE2(cpop_status_nonnull
, int,
1157 (int)xnbp
->xnb_rx_cpop
[i
].status
,
1159 status
= NETIF_RSP_ERROR
;
1164 if (status
!= NETIF_RSP_OKAY
) {
1165 RING_GET_RESPONSE(&xnbp
->xnb_rx_ring
, prod
)->status
=
1167 xnbp
->xnb_stat_rx_rsp_notok
++;
1169 xnbp
->xnb_stat_ipackets
++;
1170 xnbp
->xnb_stat_rbytes
+= len
;
1180 * Did we actually do anything?
1182 if (loop
== xnbp
->xnb_rx_ring
.req_cons
) {
1183 mutex_exit(&xnbp
->xnb_rx_lock
);
1188 * Unlink the end of the 'done' list from the remainder.
1190 ASSERT(mp_prev
!= NULL
);
1191 mp_prev
->b_next
= NULL
;
1193 xnbp
->xnb_rx_ring
.req_cons
= loop
;
1194 xnbp
->xnb_rx_ring
.rsp_prod_pvt
= prod
;
1197 /* LINTED: constant in conditional context */
1198 RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp
->xnb_rx_ring
, notify
);
1200 ec_notify_via_evtchn(xnbp
->xnb_evtchn
);
1201 xnbp
->xnb_stat_rx_notify_sent
++;
1203 xnbp
->xnb_stat_rx_notify_deferred
++;
1207 xnbp
->xnb_stat_rx_defer
++;
1209 mutex_exit(&xnbp
->xnb_rx_lock
);
1211 /* Free mblk_t structs we have consumed. */
1219 xnb_tx_notify_peer(xnb_t
*xnbp
, boolean_t force
)
1223 ASSERT(MUTEX_HELD(&xnbp
->xnb_tx_lock
));
1225 /* LINTED: constant in conditional context */
1226 RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp
->xnb_tx_ring
, notify
);
1227 if (notify
|| force
) {
1228 ec_notify_via_evtchn(xnbp
->xnb_evtchn
);
1229 xnbp
->xnb_stat_tx_notify_sent
++;
1231 xnbp
->xnb_stat_tx_notify_deferred
++;
1236 xnb_tx_mark_complete(xnb_t
*xnbp
, RING_IDX id
, int16_t status
)
1239 netif_tx_response_t
*txresp
;
1241 ASSERT(MUTEX_HELD(&xnbp
->xnb_tx_lock
));
1243 i
= xnbp
->xnb_tx_ring
.rsp_prod_pvt
;
1245 txresp
= RING_GET_RESPONSE(&xnbp
->xnb_tx_ring
, i
);
1247 txresp
->status
= status
;
1249 xnbp
->xnb_tx_ring
.rsp_prod_pvt
= i
+ 1;
1252 * Note that we don't push the change to the peer here - that
1253 * is the caller's responsibility.
1258 xnb_txbuf_recycle(xnb_txbuf_t
*txp
)
1260 xnb_t
*xnbp
= txp
->xt_xnbp
;
1262 kmem_cache_free(xnbp
->xnb_tx_buf_cache
, txp
);
1264 xnbp
->xnb_tx_buf_outstanding
--;
1268 xnb_txbuf_constructor(void *buf
, void *arg
, int kmflag
)
1270 _NOTE(ARGUNUSED(kmflag
));
1271 xnb_txbuf_t
*txp
= buf
;
1274 ddi_dma_cookie_t dma_cookie
;
1277 txp
->xt_free_rtn
.free_func
= xnb_txbuf_recycle
;
1278 txp
->xt_free_rtn
.free_arg
= (caddr_t
)txp
;
1279 txp
->xt_xnbp
= xnbp
;
1280 txp
->xt_next
= NULL
;
1282 if (ddi_dma_alloc_handle(xnbp
->xnb_devinfo
, &buf_dma_attr
,
1283 0, 0, &txp
->xt_dma_handle
) != DDI_SUCCESS
)
1286 if (ddi_dma_mem_alloc(txp
->xt_dma_handle
, PAGESIZE
, &data_accattr
,
1287 DDI_DMA_STREAMING
, 0, 0, &txp
->xt_buf
, &len
,
1288 &txp
->xt_acc_handle
) != DDI_SUCCESS
)
1291 if (ddi_dma_addr_bind_handle(txp
->xt_dma_handle
, NULL
, txp
->xt_buf
,
1292 len
, DDI_DMA_RDWR
| DDI_DMA_STREAMING
, DDI_DMA_DONTWAIT
, 0,
1293 &dma_cookie
, &ncookies
)
1296 ASSERT(ncookies
== 1);
1298 txp
->xt_mfn
= xnb_btop(dma_cookie
.dmac_laddress
);
1299 txp
->xt_buflen
= dma_cookie
.dmac_size
;
1301 DTRACE_PROBE(txbuf_allocated
);
1303 atomic_inc_32(&xnbp
->xnb_tx_buf_count
);
1304 xnbp
->xnb_tx_buf_outstanding
++;
1309 ddi_dma_mem_free(&txp
->xt_acc_handle
);
1312 ddi_dma_free_handle(&txp
->xt_dma_handle
);
1320 xnb_txbuf_destructor(void *buf
, void *arg
)
1322 xnb_txbuf_t
*txp
= buf
;
1325 (void) ddi_dma_unbind_handle(txp
->xt_dma_handle
);
1326 ddi_dma_mem_free(&txp
->xt_acc_handle
);
1327 ddi_dma_free_handle(&txp
->xt_dma_handle
);
1329 atomic_dec_32(&xnbp
->xnb_tx_buf_count
);
1333 * Take packets from the peer and deliver them onward.
1336 xnb_from_peer(xnb_t
*xnbp
)
1338 RING_IDX start
, end
, loop
;
1341 netif_tx_request_t
*txreq
;
1342 boolean_t work_to_do
, need_notify
= B_FALSE
;
1343 mblk_t
*head
, *tail
;
1346 ASSERT(MUTEX_HELD(&xnbp
->xnb_tx_lock
));
1351 /* LINTED: constant in conditional context */
1352 RING_FINAL_CHECK_FOR_REQUESTS(&xnbp
->xnb_tx_ring
, work_to_do
);
1355 xnb_tx_notify_peer(xnbp
, need_notify
);
1360 start
= xnbp
->xnb_tx_ring
.req_cons
;
1361 end
= xnbp
->xnb_tx_ring
.sring
->req_prod
;
1363 if ((end
- start
) > NET_TX_RING_SIZE
) {
1365 * This usually indicates that the frontend driver is
1366 * misbehaving, as it's not possible to have more than
1367 * NET_TX_RING_SIZE ring elements in play at any one
1370 * We reset the ring pointers to the state declared by
1371 * the frontend and try to carry on.
1373 cmn_err(CE_WARN
, "xnb_from_peer: domain %d tried to give us %u "
1374 "items in the ring, resetting and trying to recover.",
1375 xnbp
->xnb_peer
, (end
- start
));
1377 /* LINTED: constant in conditional context */
1378 BACK_RING_ATTACH(&xnbp
->xnb_tx_ring
,
1379 (netif_tx_sring_t
*)xnbp
->xnb_tx_ring_addr
, PAGESIZE
);
1385 cop
= xnbp
->xnb_tx_cop
;
1386 txpp
= xnbp
->xnb_tx_bufp
;
1389 while (loop
< end
) {
1390 static const uint16_t acceptable_flags
=
1392 NETTXF_data_validated
|
1394 uint16_t unexpected_flags
;
1396 txreq
= RING_GET_REQUEST(&xnbp
->xnb_tx_ring
, loop
);
1398 unexpected_flags
= txreq
->flags
& ~acceptable_flags
;
1399 if (unexpected_flags
!= 0) {
1401 * The peer used flag bits that we do not
1404 cmn_err(CE_WARN
, "xnb_from_peer: "
1405 "unexpected flag bits (0x%x) from peer "
1406 "in transmit request",
1408 xnbp
->xnb_stat_tx_unexpected_flags
++;
1410 /* Mark this entry as failed. */
1411 xnb_tx_mark_complete(xnbp
, txreq
->id
, NETIF_RSP_ERROR
);
1412 need_notify
= B_TRUE
;
1414 } else if (txreq
->flags
& NETTXF_extra_info
) {
1415 struct netif_extra_info
*erp
;
1418 loop
++; /* Consume another slot in the ring. */
1419 ASSERT(loop
<= end
);
1421 erp
= (struct netif_extra_info
*)
1422 RING_GET_REQUEST(&xnbp
->xnb_tx_ring
, loop
);
1424 switch (erp
->type
) {
1425 case XEN_NETIF_EXTRA_TYPE_MCAST_ADD
:
1426 ASSERT(xnbp
->xnb_multicast_control
);
1427 status
= xnbp
->xnb_flavour
->xf_mcast_add(xnbp
,
1428 &erp
->u
.mcast
.addr
);
1430 case XEN_NETIF_EXTRA_TYPE_MCAST_DEL
:
1431 ASSERT(xnbp
->xnb_multicast_control
);
1432 status
= xnbp
->xnb_flavour
->xf_mcast_del(xnbp
,
1433 &erp
->u
.mcast
.addr
);
1437 cmn_err(CE_WARN
, "xnb_from_peer: "
1438 "unknown extra type %d", erp
->type
);
1442 xnb_tx_mark_complete(xnbp
, txreq
->id
,
1443 status
? NETIF_RSP_OKAY
: NETIF_RSP_ERROR
);
1444 need_notify
= B_TRUE
;
1446 } else if ((txreq
->offset
> PAGESIZE
) ||
1447 (txreq
->offset
+ txreq
->size
> PAGESIZE
)) {
1449 * Peer attempted to refer to data beyond the
1450 * end of the granted page.
1452 cmn_err(CE_WARN
, "xnb_from_peer: "
1453 "attempt to refer beyond the end of granted "
1454 "page in txreq (offset %d, size %d).",
1455 txreq
->offset
, txreq
->size
);
1456 xnbp
->xnb_stat_tx_overflow_page
++;
1458 /* Mark this entry as failed. */
1459 xnb_tx_mark_complete(xnbp
, txreq
->id
, NETIF_RSP_ERROR
);
1460 need_notify
= B_TRUE
;
1465 txp
= kmem_cache_alloc(xnbp
->xnb_tx_buf_cache
,
1470 txp
->xt_mblk
= desballoc((unsigned char *)txp
->xt_buf
,
1471 txp
->xt_buflen
, 0, &txp
->xt_free_rtn
);
1472 if (txp
->xt_mblk
== NULL
) {
1473 kmem_cache_free(xnbp
->xnb_tx_buf_cache
, txp
);
1478 txp
->xt_id
= txreq
->id
;
1480 cop
->source
.u
.ref
= txreq
->gref
;
1481 cop
->source
.domid
= xnbp
->xnb_peer
;
1482 cop
->source
.offset
= txreq
->offset
;
1484 cop
->dest
.u
.gmfn
= txp
->xt_mfn
;
1485 cop
->dest
.domid
= DOMID_SELF
;
1486 cop
->dest
.offset
= 0;
1488 cop
->len
= txreq
->size
;
1489 cop
->flags
= GNTCOPY_source_gref
;
1498 ASSERT(n_data_req
<= NET_TX_RING_SIZE
);
1504 xnbp
->xnb_tx_ring
.req_cons
= loop
;
1506 if (n_data_req
== 0)
1509 if (HYPERVISOR_grant_table_op(GNTTABOP_copy
,
1510 xnbp
->xnb_tx_cop
, n_data_req
) != 0) {
1512 cmn_err(CE_WARN
, "xnb_from_peer: copy operation failed");
1514 txpp
= xnbp
->xnb_tx_bufp
;
1517 kmem_cache_free(xnbp
->xnb_tx_buf_cache
, *txpp
);
1525 txpp
= xnbp
->xnb_tx_bufp
;
1526 cop
= xnbp
->xnb_tx_cop
;
1530 xnb_txbuf_t
*txp
= *txpp
;
1532 txreq
= RING_GET_REQUEST(&xnbp
->xnb_tx_ring
, txp
->xt_idx
);
1534 if (cop
->status
!= 0) {
1536 cmn_err(CE_WARN
, "xnb_from_peer: "
1537 "txpp 0x%p failed (%d)",
1538 (void *)*txpp
, cop
->status
);
1539 #endif /* XNB_DEBUG */
1540 xnb_tx_mark_complete(xnbp
, txp
->xt_id
, NETIF_RSP_ERROR
);
1541 freemsg(txp
->xt_mblk
);
1546 mp
->b_rptr
= mp
->b_wptr
= (unsigned char *)txp
->xt_buf
;
1547 mp
->b_wptr
+= txreq
->size
;
1551 * If there are checksum flags, process them
1555 (NETTXF_csum_blank
| NETTXF_data_validated
))
1557 mp
= xnbp
->xnb_flavour
->xf_cksum_from_peer(xnbp
,
1559 xnbp
->xnb_stat_tx_cksum_no_need
++;
1565 ASSERT(tail
== NULL
);
1568 ASSERT(tail
!= NULL
);
1573 xnbp
->xnb_stat_opackets
++;
1574 xnbp
->xnb_stat_obytes
+= txreq
->size
;
1576 xnb_tx_mark_complete(xnbp
, txp
->xt_id
, NETIF_RSP_OKAY
);
1589 xnb_intr(caddr_t arg
)
1591 xnb_t
*xnbp
= (xnb_t
*)arg
;
1594 xnbp
->xnb_stat_intr
++;
1596 mutex_enter(&xnbp
->xnb_tx_lock
);
1598 ASSERT(xnbp
->xnb_connected
);
1600 mp
= xnb_from_peer(xnbp
);
1602 mutex_exit(&xnbp
->xnb_tx_lock
);
1604 if (!xnbp
->xnb_hotplugged
) {
1605 xnbp
->xnb_stat_tx_too_early
++;
1609 xnbp
->xnb_stat_spurious_intr
++;
1613 xnbp
->xnb_flavour
->xf_from_peer(xnbp
, mp
);
1615 return (DDI_INTR_CLAIMED
);
1619 return (DDI_INTR_CLAIMED
);
1623 * Read our configuration from xenstore.
1626 xnb_read_xs_config(xnb_t
*xnbp
)
1629 char mac
[ETHERADDRL
* 3];
1631 xsname
= xvdi_get_xsname(xnbp
->xnb_devinfo
);
1633 if (xenbus_scanf(XBT_NULL
, xsname
,
1634 "mac", "%s", mac
) != 0) {
1635 cmn_err(CE_WARN
, "xnb_attach: "
1636 "cannot read mac address from %s",
1641 if (ether_aton(mac
, xnbp
->xnb_mac_addr
) != ETHERADDRL
) {
1643 "xnb_attach: cannot parse mac address %s",
1652 * Read the configuration of the peer from xenstore.
1655 xnb_read_oe_config(xnb_t
*xnbp
)
1660 oename
= xvdi_get_oename(xnbp
->xnb_devinfo
);
1662 if (xenbus_gather(XBT_NULL
, oename
,
1663 "event-channel", "%u", &xnbp
->xnb_fe_evtchn
,
1664 "tx-ring-ref", "%lu", &xnbp
->xnb_tx_ring_ref
,
1665 "rx-ring-ref", "%lu", &xnbp
->xnb_rx_ring_ref
,
1667 cmn_err(CE_WARN
, "xnb_read_oe_config: "
1668 "cannot read other-end details from %s",
1674 * Check whether our peer requests receive side hypervisor
1677 if (xenbus_scanf(XBT_NULL
, oename
,
1678 "request-rx-copy", "%d", &i
) != 0)
1681 xnbp
->xnb_rx_hv_copy
= B_TRUE
;
1684 * Check whether our peer requests multicast_control.
1686 if (xenbus_scanf(XBT_NULL
, oename
,
1687 "request-multicast-control", "%d", &i
) != 0)
1690 xnbp
->xnb_multicast_control
= B_TRUE
;
1693 * The Linux backend driver here checks to see if the peer has
1694 * set 'feature-no-csum-offload'. This is used to indicate
1695 * that the guest cannot handle receiving packets without a
1696 * valid checksum. We don't check here, because packets passed
1697 * to the peer _always_ have a valid checksum.
1699 * There are three cases:
1701 * - the NIC is dedicated: packets from the wire should always
1702 * have a valid checksum. If the hardware validates the
1703 * checksum then the relevant bit will be set in the packet
1704 * attributes and we will inform the peer. It can choose to
1705 * ignore the hardware verification.
1707 * - the NIC is shared (VNIC) and a packet originates from the
1708 * wire: this is the same as the case above - the packets
1709 * will have a valid checksum.
1711 * - the NIC is shared (VNIC) and a packet originates from the
1712 * host: the MAC layer ensures that all such packets have a
1713 * valid checksum by calculating one if the stack did not.
1720 xnb_start_connect(xnb_t
*xnbp
)
1722 dev_info_t
*dip
= xnbp
->xnb_devinfo
;
1724 if (!xnb_connect_rings(dip
)) {
1725 cmn_err(CE_WARN
, "xnb_start_connect: "
1726 "cannot connect rings");
1730 if (!xnbp
->xnb_flavour
->xf_start_connect(xnbp
)) {
1731 cmn_err(CE_WARN
, "xnb_start_connect: "
1732 "flavour failed to connect");
1736 (void) xvdi_switch_state(dip
, XBT_NULL
, XenbusStateConnected
);
1740 xnbp
->xnb_flavour
->xf_peer_disconnected(xnbp
);
1741 xnb_disconnect_rings(dip
);
1742 (void) xvdi_switch_state(dip
, XBT_NULL
,
1744 (void) xvdi_post_event(dip
, XEN_HP_REMOVE
);
1748 xnb_connect_rings(dev_info_t
*dip
)
1750 xnb_t
*xnbp
= ddi_get_driver_private(dip
);
1751 struct gnttab_map_grant_ref map_op
;
1754 * Cannot attempt to connect the rings if already connected.
1756 ASSERT(!xnbp
->xnb_connected
);
1759 * 1. allocate a vaddr for the tx page, one for the rx page.
1760 * 2. call GNTTABOP_map_grant_ref to map the relevant pages
1761 * into the allocated vaddr (one for tx, one for rx).
1762 * 3. call EVTCHNOP_bind_interdomain to have the event channel
1763 * bound to this domain.
1764 * 4. associate the event channel with an interrupt.
1765 * 5. enable the interrupt.
1769 xnbp
->xnb_tx_ring_addr
= vmem_xalloc(heap_arena
, PAGESIZE
, PAGESIZE
,
1770 0, 0, 0, 0, VM_SLEEP
);
1771 ASSERT(xnbp
->xnb_tx_ring_addr
!= NULL
);
1774 map_op
.host_addr
= (uint64_t)((long)xnbp
->xnb_tx_ring_addr
);
1775 map_op
.flags
= GNTMAP_host_map
;
1776 map_op
.ref
= xnbp
->xnb_tx_ring_ref
;
1777 map_op
.dom
= xnbp
->xnb_peer
;
1778 hat_prepare_mapping(kas
.a_hat
, xnbp
->xnb_tx_ring_addr
, NULL
);
1779 if (xen_map_gref(GNTTABOP_map_grant_ref
, &map_op
, 1, B_FALSE
) != 0 ||
1780 map_op
.status
!= 0) {
1781 cmn_err(CE_WARN
, "xnb_connect_rings: cannot map tx-ring page.");
1784 xnbp
->xnb_tx_ring_handle
= map_op
.handle
;
1786 /* LINTED: constant in conditional context */
1787 BACK_RING_INIT(&xnbp
->xnb_tx_ring
,
1788 (netif_tx_sring_t
*)xnbp
->xnb_tx_ring_addr
, PAGESIZE
);
1791 xnbp
->xnb_rx_ring_addr
= vmem_xalloc(heap_arena
, PAGESIZE
, PAGESIZE
,
1792 0, 0, 0, 0, VM_SLEEP
);
1793 ASSERT(xnbp
->xnb_rx_ring_addr
!= NULL
);
1796 map_op
.host_addr
= (uint64_t)((long)xnbp
->xnb_rx_ring_addr
);
1797 map_op
.flags
= GNTMAP_host_map
;
1798 map_op
.ref
= xnbp
->xnb_rx_ring_ref
;
1799 map_op
.dom
= xnbp
->xnb_peer
;
1800 hat_prepare_mapping(kas
.a_hat
, xnbp
->xnb_rx_ring_addr
, NULL
);
1801 if (xen_map_gref(GNTTABOP_map_grant_ref
, &map_op
, 1, B_FALSE
) != 0 ||
1802 map_op
.status
!= 0) {
1803 cmn_err(CE_WARN
, "xnb_connect_rings: cannot map rx-ring page.");
1806 xnbp
->xnb_rx_ring_handle
= map_op
.handle
;
1808 /* LINTED: constant in conditional context */
1809 BACK_RING_INIT(&xnbp
->xnb_rx_ring
,
1810 (netif_rx_sring_t
*)xnbp
->xnb_rx_ring_addr
, PAGESIZE
);
1813 if (xvdi_bind_evtchn(dip
, xnbp
->xnb_fe_evtchn
) != DDI_SUCCESS
) {
1814 cmn_err(CE_WARN
, "xnb_connect_rings: "
1815 "cannot bind event channel %d", xnbp
->xnb_evtchn
);
1816 xnbp
->xnb_evtchn
= INVALID_EVTCHN
;
1819 xnbp
->xnb_evtchn
= xvdi_get_evtchn(dip
);
1822 * It would be good to set the state to XenbusStateConnected
1823 * here as well, but then what if ddi_add_intr() failed?
1824 * Changing the state in the store will be noticed by the peer
1825 * and cannot be "taken back".
1827 mutex_enter(&xnbp
->xnb_tx_lock
);
1828 mutex_enter(&xnbp
->xnb_rx_lock
);
1830 xnbp
->xnb_connected
= B_TRUE
;
1832 mutex_exit(&xnbp
->xnb_rx_lock
);
1833 mutex_exit(&xnbp
->xnb_tx_lock
);
1836 if (ddi_add_intr(dip
, 0, NULL
, NULL
, xnb_intr
, (caddr_t
)xnbp
)
1838 cmn_err(CE_WARN
, "xnb_connect_rings: cannot add interrupt");
1841 xnbp
->xnb_irq
= B_TRUE
;
1846 mutex_enter(&xnbp
->xnb_tx_lock
);
1847 mutex_enter(&xnbp
->xnb_rx_lock
);
1849 xnbp
->xnb_connected
= B_FALSE
;
1851 mutex_exit(&xnbp
->xnb_rx_lock
);
1852 mutex_exit(&xnbp
->xnb_tx_lock
);
1858 xnb_disconnect_rings(dev_info_t
*dip
)
1860 xnb_t
*xnbp
= ddi_get_driver_private(dip
);
1862 if (xnbp
->xnb_irq
) {
1863 ddi_remove_intr(dip
, 0, NULL
);
1864 xnbp
->xnb_irq
= B_FALSE
;
1867 if (xnbp
->xnb_evtchn
!= INVALID_EVTCHN
) {
1868 xvdi_free_evtchn(dip
);
1869 xnbp
->xnb_evtchn
= INVALID_EVTCHN
;
1872 if (xnbp
->xnb_rx_ring_handle
!= INVALID_GRANT_HANDLE
) {
1873 struct gnttab_unmap_grant_ref unmap_op
;
1875 unmap_op
.host_addr
= (uint64_t)(uintptr_t)
1876 xnbp
->xnb_rx_ring_addr
;
1877 unmap_op
.dev_bus_addr
= 0;
1878 unmap_op
.handle
= xnbp
->xnb_rx_ring_handle
;
1879 if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref
,
1881 cmn_err(CE_WARN
, "xnb_disconnect_rings: "
1882 "cannot unmap rx-ring page (%d)",
1885 xnbp
->xnb_rx_ring_handle
= INVALID_GRANT_HANDLE
;
1888 if (xnbp
->xnb_rx_ring_addr
!= NULL
) {
1889 hat_release_mapping(kas
.a_hat
, xnbp
->xnb_rx_ring_addr
);
1890 vmem_free(heap_arena
, xnbp
->xnb_rx_ring_addr
, PAGESIZE
);
1891 xnbp
->xnb_rx_ring_addr
= NULL
;
1894 if (xnbp
->xnb_tx_ring_handle
!= INVALID_GRANT_HANDLE
) {
1895 struct gnttab_unmap_grant_ref unmap_op
;
1897 unmap_op
.host_addr
= (uint64_t)(uintptr_t)
1898 xnbp
->xnb_tx_ring_addr
;
1899 unmap_op
.dev_bus_addr
= 0;
1900 unmap_op
.handle
= xnbp
->xnb_tx_ring_handle
;
1901 if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref
,
1903 cmn_err(CE_WARN
, "xnb_disconnect_rings: "
1904 "cannot unmap tx-ring page (%d)",
1907 xnbp
->xnb_tx_ring_handle
= INVALID_GRANT_HANDLE
;
1910 if (xnbp
->xnb_tx_ring_addr
!= NULL
) {
1911 hat_release_mapping(kas
.a_hat
, xnbp
->xnb_tx_ring_addr
);
1912 vmem_free(heap_arena
, xnbp
->xnb_tx_ring_addr
, PAGESIZE
);
1913 xnbp
->xnb_tx_ring_addr
= NULL
;
1918 xnb_oe_state_change(dev_info_t
*dip
, ddi_eventcookie_t id
,
1919 void *arg
, void *impl_data
)
1921 _NOTE(ARGUNUSED(id
, arg
));
1922 xnb_t
*xnbp
= ddi_get_driver_private(dip
);
1923 XenbusState new_state
= *(XenbusState
*)impl_data
;
1925 ASSERT(xnbp
!= NULL
);
1927 switch (new_state
) {
1928 case XenbusStateConnected
:
1929 /* spurious state change */
1930 if (xnbp
->xnb_connected
)
1933 if (!xnb_read_oe_config(xnbp
) ||
1934 !xnbp
->xnb_flavour
->xf_peer_connected(xnbp
)) {
1935 cmn_err(CE_WARN
, "xnb_oe_state_change: "
1936 "read otherend config error");
1937 (void) xvdi_switch_state(dip
, XBT_NULL
,
1939 (void) xvdi_post_event(dip
, XEN_HP_REMOVE
);
1945 mutex_enter(&xnbp
->xnb_state_lock
);
1946 xnbp
->xnb_fe_status
= XNB_STATE_READY
;
1947 if (xnbp
->xnb_be_status
== XNB_STATE_READY
)
1948 xnb_start_connect(xnbp
);
1949 mutex_exit(&xnbp
->xnb_state_lock
);
1952 * Now that we've attempted to connect it's reasonable
1953 * to allow an attempt to detach.
1955 xnbp
->xnb_detachable
= B_TRUE
;
1959 case XenbusStateClosing
:
1960 (void) xvdi_switch_state(dip
, XBT_NULL
, XenbusStateClosing
);
1964 case XenbusStateClosed
:
1965 xnbp
->xnb_flavour
->xf_peer_disconnected(xnbp
);
1967 mutex_enter(&xnbp
->xnb_tx_lock
);
1968 mutex_enter(&xnbp
->xnb_rx_lock
);
1970 xnb_disconnect_rings(dip
);
1971 xnbp
->xnb_connected
= B_FALSE
;
1973 mutex_exit(&xnbp
->xnb_rx_lock
);
1974 mutex_exit(&xnbp
->xnb_tx_lock
);
1976 (void) xvdi_switch_state(dip
, XBT_NULL
, XenbusStateClosed
);
1977 (void) xvdi_post_event(dip
, XEN_HP_REMOVE
);
1979 * In all likelyhood this is already set (in the above
1980 * case), but if the peer never attempted to connect
1981 * and the domain is destroyed we get here without
1982 * having been through the case above, so we set it to
1985 xnbp
->xnb_detachable
= B_TRUE
;
1995 xnb_hp_state_change(dev_info_t
*dip
, ddi_eventcookie_t id
,
1996 void *arg
, void *impl_data
)
1998 _NOTE(ARGUNUSED(id
, arg
));
1999 xnb_t
*xnbp
= ddi_get_driver_private(dip
);
2000 xendev_hotplug_state_t state
= *(xendev_hotplug_state_t
*)impl_data
;
2002 ASSERT(xnbp
!= NULL
);
2006 /* spurious hotplug event */
2007 if (xnbp
->xnb_hotplugged
)
2010 if (!xnb_read_xs_config(xnbp
))
2013 if (!xnbp
->xnb_flavour
->xf_hotplug_connected(xnbp
))
2016 mutex_enter(&xnbp
->xnb_tx_lock
);
2017 mutex_enter(&xnbp
->xnb_rx_lock
);
2019 xnbp
->xnb_hotplugged
= B_TRUE
;
2021 mutex_exit(&xnbp
->xnb_rx_lock
);
2022 mutex_exit(&xnbp
->xnb_tx_lock
);
2024 mutex_enter(&xnbp
->xnb_state_lock
);
2025 xnbp
->xnb_be_status
= XNB_STATE_READY
;
2026 if (xnbp
->xnb_fe_status
== XNB_STATE_READY
)
2027 xnb_start_connect(xnbp
);
2028 mutex_exit(&xnbp
->xnb_state_lock
);
2037 static struct modldrv modldrv
= {
2038 &mod_miscops
, "xnb",
2041 static struct modlinkage modlinkage
= {
2042 MODREV_1
, &modldrv
, NULL
2050 mutex_init(&xnb_alloc_page_lock
, NULL
, MUTEX_DRIVER
, NULL
);
2052 i
= mod_install(&modlinkage
);
2053 if (i
!= DDI_SUCCESS
)
2054 mutex_destroy(&xnb_alloc_page_lock
);
2060 _info(struct modinfo
*modinfop
)
2062 return (mod_info(&modlinkage
, modinfop
));
2070 i
= mod_remove(&modlinkage
);
2071 if (i
== DDI_SUCCESS
)
2072 mutex_destroy(&xnb_alloc_page_lock
);