Adding debian version 3.70~pre8+dfsg-1.
[syslinux-debian/hramrach.git] / gpxe / src / drivers / net / ipoib.c
blobd457b258a4095afe4bff41ea41dc1d336fdac5fe
1 /*
2 * Copyright (C) 2007 Michael Brown <mbrown@fensystems.co.uk>.
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License as
6 * published by the Free Software Foundation; either version 2 of the
7 * License, or any later version.
9 * This program is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 #include <stdint.h>
20 #include <stdio.h>
21 #include <unistd.h>
22 #include <string.h>
23 #include <byteswap.h>
24 #include <errno.h>
25 #include <gpxe/if_arp.h>
26 #include <gpxe/iobuf.h>
27 #include <gpxe/netdevice.h>
28 #include <gpxe/infiniband.h>
29 #include <gpxe/ipoib.h>
31 /** @file
33 * IP over Infiniband
/** IPoIB MTU (also used as the size of each receive buffer) */
#define IPOIB_MTU			2048

/*
 * Data queue set sizing
 */

/** Number of IPoIB data send work queue entries */
#define IPOIB_DATA_NUM_SEND_WQES	2

/** Number of IPoIB data receive work queue entries */
#define IPOIB_DATA_NUM_RECV_WQES	4

/** Number of IPoIB data completion entries */
#define IPOIB_DATA_NUM_CQES		8

/*
 * Metadata queue set sizing
 */

/** Number of IPoIB metadata send work queue entries */
#define IPOIB_META_NUM_SEND_WQES	2

/** Number of IPoIB metadata receive work queue entries */
#define IPOIB_META_NUM_RECV_WQES	2

/** Number of IPoIB metadata completion entries */
#define IPOIB_META_NUM_CQES		8
/**
 * An IPoIB queue set
 *
 * Groups one queue pair with the single completion queue that serves
 * both its send and receive sides, plus receive ring bookkeeping.
 */
struct ipoib_queue_set {
	/** Completion queue */
	struct ib_completion_queue *cq;
	/** Queue pair */
	struct ib_queue_pair *qp;
	/** Number of receive buffers currently posted */
	unsigned int recv_fill;
	/** Maximum number of receive buffers to keep posted */
	unsigned int recv_max_fill;
};
69 /** An IPoIB device */
70 struct ipoib_device {
71 /** Network device */
72 struct net_device *netdev;
73 /** Underlying Infiniband device */
74 struct ib_device *ibdev;
75 /** Data queue set */
76 struct ipoib_queue_set data;
77 /** Data queue set */
78 struct ipoib_queue_set meta;
79 /** Broadcast GID */
80 struct ib_gid broadcast_gid;
81 /** Broadcast LID */
82 unsigned int broadcast_lid;
83 /** Joined to broadcast group */
84 int broadcast_joined;
85 /** Data queue key */
86 unsigned long data_qkey;
89 /**
90 * IPoIB path cache entry
92 * This serves a similar role to the ARP cache for Ethernet. (ARP
93 * *is* used on IPoIB; we have two caches to maintain.)
95 struct ipoib_cached_path {
96 /** Destination GID */
97 struct ib_gid gid;
98 /** Destination LID */
99 unsigned int dlid;
100 /** Service level */
101 unsigned int sl;
102 /** Rate */
103 unsigned int rate;
106 /** Number of IPoIB path cache entries */
107 #define IPOIB_NUM_CACHED_PATHS 2
109 /** IPoIB path cache */
110 static struct ipoib_cached_path ipoib_path_cache[IPOIB_NUM_CACHED_PATHS];
112 /** Oldest IPoIB path cache entry index */
113 static unsigned int ipoib_path_cache_idx = 0;
115 /** TID half used to identify get path record replies */
116 #define IPOIB_TID_GET_PATH_REC 0x11111111UL
118 /** TID half used to identify multicast member record replies */
119 #define IPOIB_TID_MC_MEMBER_REC 0x22222222UL
121 /** IPoIB metadata TID */
122 static uint32_t ipoib_meta_tid = 0;
124 /** IPv4 broadcast GID */
125 static const struct ib_gid ipv4_broadcast_gid = {
126 { { 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
127 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff } }
130 /** Maximum time we will wait for the broadcast join to succeed */
131 #define IPOIB_JOIN_MAX_DELAY_MS 1000
133 /****************************************************************************
135 * IPoIB link layer
137 ****************************************************************************
140 /** Broadcast QPN used in IPoIB MAC addresses
142 * This is a guaranteed invalid real QPN
144 #define IPOIB_BROADCAST_QPN 0xffffffffUL
146 /** Broadcast IPoIB address */
147 static struct ipoib_mac ipoib_broadcast = {
148 .qpn = ntohl ( IPOIB_BROADCAST_QPN ),
152 * Transmit IPoIB packet
154 * @v iobuf I/O buffer
155 * @v netdev Network device
156 * @v net_protocol Network-layer protocol
157 * @v ll_dest Link-layer destination address
159 * Prepends the IPoIB link-layer header and transmits the packet.
161 static int ipoib_tx ( struct io_buffer *iobuf, struct net_device *netdev,
162 struct net_protocol *net_protocol,
163 const void *ll_dest ) {
164 struct ipoib_hdr *ipoib_hdr =
165 iob_push ( iobuf, sizeof ( *ipoib_hdr ) );
167 /* Build IPoIB header */
168 memcpy ( &ipoib_hdr->pseudo.peer, ll_dest,
169 sizeof ( ipoib_hdr->pseudo.peer ) );
170 ipoib_hdr->real.proto = net_protocol->net_proto;
171 ipoib_hdr->real.reserved = 0;
173 /* Hand off to network device */
174 return netdev_tx ( netdev, iobuf );
178 * Process received IPoIB packet
180 * @v iobuf I/O buffer
181 * @v netdev Network device
183 * Strips off the IPoIB link-layer header and passes up to the
184 * network-layer protocol.
186 static int ipoib_rx ( struct io_buffer *iobuf, struct net_device *netdev ) {
187 struct ipoib_hdr *ipoib_hdr = iobuf->data;
189 /* Sanity check */
190 if ( iob_len ( iobuf ) < sizeof ( *ipoib_hdr ) ) {
191 DBG ( "IPoIB packet too short for link-layer header\n" );
192 DBG_HD ( iobuf->data, iob_len ( iobuf ) );
193 free_iob ( iobuf );
194 return -EINVAL;
197 /* Strip off IPoIB header */
198 iob_pull ( iobuf, sizeof ( *ipoib_hdr ) );
200 /* Hand off to network-layer protocol */
201 return net_rx ( iobuf, netdev, ipoib_hdr->real.proto,
202 &ipoib_hdr->pseudo.peer );
206 * Transcribe IPoIB address
208 * @v ll_addr Link-layer address
209 * @ret string Link-layer address in human-readable format
211 const char * ipoib_ntoa ( const void *ll_addr ) {
212 static char buf[45];
213 const struct ipoib_mac *mac = ll_addr;
215 snprintf ( buf, sizeof ( buf ), "%08lx:%08lx:%08lx:%08lx:%08lx",
216 htonl ( mac->qpn ), htonl ( mac->gid.u.dwords[0] ),
217 htonl ( mac->gid.u.dwords[1] ),
218 htonl ( mac->gid.u.dwords[2] ),
219 htonl ( mac->gid.u.dwords[3] ) );
220 return buf;
223 /** IPoIB protocol */
224 struct ll_protocol ipoib_protocol __ll_protocol = {
225 .name = "IPoIB",
226 .ll_proto = htons ( ARPHRD_INFINIBAND ),
227 .ll_addr_len = IPOIB_ALEN,
228 .ll_header_len = IPOIB_HLEN,
229 .ll_broadcast = ( uint8_t * ) &ipoib_broadcast,
230 .tx = ipoib_tx,
231 .rx = ipoib_rx,
232 .ntoa = ipoib_ntoa,
235 /****************************************************************************
237 * IPoIB network device
239 ****************************************************************************
243 * Destroy queue set
245 * @v ipoib IPoIB device
246 * @v qset Queue set
248 static void ipoib_destroy_qset ( struct ipoib_device *ipoib,
249 struct ipoib_queue_set *qset ) {
250 struct ib_device *ibdev = ipoib->ibdev;
252 if ( qset->qp )
253 ib_destroy_qp ( ibdev, qset->qp );
254 if ( qset->cq )
255 ib_destroy_cq ( ibdev, qset->cq );
256 memset ( qset, 0, sizeof ( *qset ) );
260 * Create queue set
262 * @v ipoib IPoIB device
263 * @v qset Queue set
264 * @ret rc Return status code
266 static int ipoib_create_qset ( struct ipoib_device *ipoib,
267 struct ipoib_queue_set *qset,
268 unsigned int num_cqes,
269 unsigned int num_send_wqes,
270 unsigned int num_recv_wqes,
271 unsigned long qkey ) {
272 struct ib_device *ibdev = ipoib->ibdev;
273 int rc;
275 /* Store queue parameters */
276 qset->recv_max_fill = num_recv_wqes;
278 /* Allocate completion queue */
279 qset->cq = ib_create_cq ( ibdev, num_cqes );
280 if ( ! qset->cq ) {
281 DBGC ( ipoib, "IPoIB %p could not allocate completion queue\n",
282 ipoib );
283 rc = -ENOMEM;
284 goto err;
287 /* Allocate queue pair */
288 qset->qp = ib_create_qp ( ibdev, num_send_wqes, qset->cq,
289 num_recv_wqes, qset->cq, qkey );
290 if ( ! qset->qp ) {
291 DBGC ( ipoib, "IPoIB %p could not allocate queue pair\n",
292 ipoib );
293 rc = -ENOMEM;
294 goto err;
296 ib_qp_set_ownerdata ( qset->qp, ipoib->netdev );
298 return 0;
300 err:
301 ipoib_destroy_qset ( ipoib, qset );
302 return rc;
306 * Find path cache entry by GID
308 * @v gid GID
309 * @ret entry Path cache entry, or NULL
311 static struct ipoib_cached_path *
312 ipoib_find_cached_path ( struct ib_gid *gid ) {
313 struct ipoib_cached_path *path;
314 unsigned int i;
316 for ( i = 0 ; i < IPOIB_NUM_CACHED_PATHS ; i++ ) {
317 path = &ipoib_path_cache[i];
318 if ( memcmp ( &path->gid, gid, sizeof ( *gid ) ) == 0 )
319 return path;
321 DBG ( "IPoIB %08lx:%08lx:%08lx:%08lx cache miss\n",
322 htonl ( gid->u.dwords[0] ), htonl ( gid->u.dwords[1] ),
323 htonl ( gid->u.dwords[2] ), htonl ( gid->u.dwords[3] ) );
324 return NULL;
328 * Transmit path record request
330 * @v ipoib IPoIB device
331 * @v gid Destination GID
332 * @ret rc Return status code
334 static int ipoib_get_path_record ( struct ipoib_device *ipoib,
335 struct ib_gid *gid ) {
336 struct ib_device *ibdev = ipoib->ibdev;
337 struct io_buffer *iobuf;
338 struct ib_mad_path_record *path_record;
339 struct ib_address_vector av;
340 int rc;
342 /* Allocate I/O buffer */
343 iobuf = alloc_iob ( sizeof ( *path_record ) );
344 if ( ! iobuf )
345 return -ENOMEM;
346 iob_put ( iobuf, sizeof ( *path_record ) );
347 path_record = iobuf->data;
348 memset ( path_record, 0, sizeof ( *path_record ) );
350 /* Construct path record request */
351 path_record->mad_hdr.base_version = IB_MGMT_BASE_VERSION;
352 path_record->mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM;
353 path_record->mad_hdr.class_version = 2;
354 path_record->mad_hdr.method = IB_MGMT_METHOD_GET;
355 path_record->mad_hdr.attr_id = htons ( IB_SA_ATTR_PATH_REC );
356 path_record->mad_hdr.tid[0] = IPOIB_TID_GET_PATH_REC;
357 path_record->mad_hdr.tid[1] = ipoib_meta_tid++;
358 path_record->sa_hdr.comp_mask[1] =
359 htonl ( IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID );
360 memcpy ( &path_record->dgid, gid, sizeof ( path_record->dgid ) );
361 memcpy ( &path_record->sgid, &ibdev->port_gid,
362 sizeof ( path_record->sgid ) );
364 /* Construct address vector */
365 memset ( &av, 0, sizeof ( av ) );
366 av.dlid = ibdev->sm_lid;
367 av.dest_qp = IB_SA_QPN;
368 av.qkey = IB_GLOBAL_QKEY;
370 /* Post send request */
371 if ( ( rc = ib_post_send ( ibdev, ipoib->meta.qp, &av,
372 iobuf ) ) != 0 ) {
373 DBGC ( ipoib, "IPoIB %p could not send get path record: %s\n",
374 ipoib, strerror ( rc ) );
375 free_iob ( iobuf );
376 return rc;
379 return 0;
383 * Transmit multicast group membership request
385 * @v ipoib IPoIB device
386 * @v gid Multicast GID
387 * @v join Join (rather than leave) group
388 * @ret rc Return status code
390 static int ipoib_mc_member_record ( struct ipoib_device *ipoib,
391 struct ib_gid *gid, int join ) {
392 struct ib_device *ibdev = ipoib->ibdev;
393 struct io_buffer *iobuf;
394 struct ib_mad_mc_member_record *mc_member_record;
395 struct ib_address_vector av;
396 int rc;
398 /* Allocate I/O buffer */
399 iobuf = alloc_iob ( sizeof ( *mc_member_record ) );
400 if ( ! iobuf )
401 return -ENOMEM;
402 iob_put ( iobuf, sizeof ( *mc_member_record ) );
403 mc_member_record = iobuf->data;
404 memset ( mc_member_record, 0, sizeof ( *mc_member_record ) );
406 /* Construct path record request */
407 mc_member_record->mad_hdr.base_version = IB_MGMT_BASE_VERSION;
408 mc_member_record->mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM;
409 mc_member_record->mad_hdr.class_version = 2;
410 mc_member_record->mad_hdr.method =
411 ( join ? IB_MGMT_METHOD_SET : IB_MGMT_METHOD_DELETE );
412 mc_member_record->mad_hdr.attr_id = htons ( IB_SA_ATTR_MC_MEMBER_REC );
413 mc_member_record->mad_hdr.tid[0] = IPOIB_TID_MC_MEMBER_REC;
414 mc_member_record->mad_hdr.tid[1] = ipoib_meta_tid++;
415 mc_member_record->sa_hdr.comp_mask[1] =
416 htonl ( IB_SA_MCMEMBER_REC_MGID | IB_SA_MCMEMBER_REC_PORT_GID |
417 IB_SA_MCMEMBER_REC_JOIN_STATE );
418 mc_member_record->scope__join_state = 1;
419 memcpy ( &mc_member_record->mgid, gid,
420 sizeof ( mc_member_record->mgid ) );
421 memcpy ( &mc_member_record->port_gid, &ibdev->port_gid,
422 sizeof ( mc_member_record->port_gid ) );
424 /* Construct address vector */
425 memset ( &av, 0, sizeof ( av ) );
426 av.dlid = ibdev->sm_lid;
427 av.dest_qp = IB_SA_QPN;
428 av.qkey = IB_GLOBAL_QKEY;
430 /* Post send request */
431 if ( ( rc = ib_post_send ( ibdev, ipoib->meta.qp, &av,
432 iobuf ) ) != 0 ) {
433 DBGC ( ipoib, "IPoIB %p could not send get path record: %s\n",
434 ipoib, strerror ( rc ) );
435 free_iob ( iobuf );
436 return rc;
439 return 0;
443 * Transmit packet via IPoIB network device
445 * @v netdev Network device
446 * @v iobuf I/O buffer
447 * @ret rc Return status code
449 static int ipoib_transmit ( struct net_device *netdev,
450 struct io_buffer *iobuf ) {
451 struct ipoib_device *ipoib = netdev->priv;
452 struct ib_device *ibdev = ipoib->ibdev;
453 struct ipoib_pseudo_hdr *ipoib_pshdr = iobuf->data;
454 struct ib_address_vector av;
455 struct ib_gid *gid;
456 struct ipoib_cached_path *path;
457 int rc;
459 /* Sanity check */
460 if ( iob_len ( iobuf ) < sizeof ( *ipoib_pshdr ) ) {
461 DBGC ( ipoib, "IPoIB %p buffer too short\n", ipoib );
462 return -EINVAL;
464 iob_pull ( iobuf, ( sizeof ( *ipoib_pshdr ) ) );
466 /* Construct address vector */
467 memset ( &av, 0, sizeof ( av ) );
468 av.qkey = IB_GLOBAL_QKEY;
469 av.gid_present = 1;
470 if ( ipoib_pshdr->peer.qpn == htonl ( IPOIB_BROADCAST_QPN ) ) {
471 /* Broadcast address */
472 av.dest_qp = IB_BROADCAST_QPN;
473 av.dlid = ipoib->broadcast_lid;
474 gid = &ipoib->broadcast_gid;
475 } else {
476 /* Unicast - look in path cache */
477 path = ipoib_find_cached_path ( &ipoib_pshdr->peer.gid );
478 if ( ! path ) {
479 /* No path entry - get path record */
480 rc = ipoib_get_path_record ( ipoib,
481 &ipoib_pshdr->peer.gid );
482 netdev_tx_complete ( netdev, iobuf );
483 return rc;
485 av.dest_qp = ntohl ( ipoib_pshdr->peer.qpn );
486 av.dlid = path->dlid;
487 av.rate = path->rate;
488 av.sl = path->sl;
489 gid = &ipoib_pshdr->peer.gid;
491 memcpy ( &av.gid, gid, sizeof ( av.gid ) );
493 return ib_post_send ( ibdev, ipoib->data.qp, &av, iobuf );
497 * Handle IPoIB data send completion
499 * @v ibdev Infiniband device
500 * @v qp Queue pair
501 * @v completion Completion
502 * @v iobuf I/O buffer
504 static void ipoib_data_complete_send ( struct ib_device *ibdev __unused,
505 struct ib_queue_pair *qp,
506 struct ib_completion *completion,
507 struct io_buffer *iobuf ) {
508 struct net_device *netdev = ib_qp_get_ownerdata ( qp );
510 netdev_tx_complete_err ( netdev, iobuf,
511 ( completion->syndrome ? -EIO : 0 ) );
515 * Handle IPoIB data receive completion
517 * @v ibdev Infiniband device
518 * @v qp Queue pair
519 * @v completion Completion
520 * @v iobuf I/O buffer
522 static void ipoib_data_complete_recv ( struct ib_device *ibdev __unused,
523 struct ib_queue_pair *qp,
524 struct ib_completion *completion,
525 struct io_buffer *iobuf ) {
526 struct net_device *netdev = ib_qp_get_ownerdata ( qp );
527 struct ipoib_device *ipoib = netdev->priv;
528 struct ipoib_pseudo_hdr *ipoib_pshdr;
530 if ( completion->syndrome ) {
531 netdev_rx_err ( netdev, iobuf, -EIO );
532 goto done;
535 iob_put ( iobuf, completion->len );
536 if ( iob_len ( iobuf ) < sizeof ( struct ib_global_route_header ) ) {
537 DBGC ( ipoib, "IPoIB %p received data packet too short to "
538 "contain GRH\n", ipoib );
539 DBGC_HD ( ipoib, iobuf->data, iob_len ( iobuf ) );
540 netdev_rx_err ( netdev, iobuf, -EIO );
541 goto done;
543 iob_pull ( iobuf, sizeof ( struct ib_global_route_header ) );
545 if ( iob_len ( iobuf ) < sizeof ( struct ipoib_real_hdr ) ) {
546 DBGC ( ipoib, "IPoIB %p received data packet too short to "
547 "contain IPoIB header\n", ipoib );
548 DBGC_HD ( ipoib, iobuf->data, iob_len ( iobuf ) );
549 netdev_rx_err ( netdev, iobuf, -EIO );
550 goto done;
553 ipoib_pshdr = iob_push ( iobuf, sizeof ( *ipoib_pshdr ) );
554 /* FIXME: fill in a MAC address for the sake of AoE! */
556 netdev_rx ( netdev, iobuf );
558 done:
559 ipoib->data.recv_fill--;
563 * Handle IPoIB metadata send completion
565 * @v ibdev Infiniband device
566 * @v qp Queue pair
567 * @v completion Completion
568 * @v iobuf I/O buffer
570 static void ipoib_meta_complete_send ( struct ib_device *ibdev __unused,
571 struct ib_queue_pair *qp,
572 struct ib_completion *completion,
573 struct io_buffer *iobuf ) {
574 struct net_device *netdev = ib_qp_get_ownerdata ( qp );
575 struct ipoib_device *ipoib = netdev->priv;
577 if ( completion->syndrome ) {
578 DBGC ( ipoib, "IPoIB %p metadata TX completion error %x\n",
579 ipoib, completion->syndrome );
581 free_iob ( iobuf );
585 * Handle received IPoIB path record
587 * @v ipoib IPoIB device
588 * @v path_record Path record
590 static void ipoib_recv_path_record ( struct ipoib_device *ipoib __unused,
591 struct ib_mad_path_record *path_record ) {
592 struct ipoib_cached_path *path;
594 /* Update path cache entry */
595 path = &ipoib_path_cache[ipoib_path_cache_idx];
596 memcpy ( &path->gid, &path_record->dgid, sizeof ( path->gid ) );
597 path->dlid = ntohs ( path_record->dlid );
598 path->sl = ( path_record->reserved__sl & 0x0f );
599 path->rate = ( path_record->rate_selector__rate & 0x3f );
601 DBG ( "IPoIB %08lx:%08lx:%08lx:%08lx dlid %x sl %x rate %x\n",
602 htonl ( path->gid.u.dwords[0] ), htonl ( path->gid.u.dwords[1] ),
603 htonl ( path->gid.u.dwords[2] ), htonl ( path->gid.u.dwords[3] ),
604 path->dlid, path->sl, path->rate );
606 /* Update path cache index */
607 ipoib_path_cache_idx++;
608 if ( ipoib_path_cache_idx == IPOIB_NUM_CACHED_PATHS )
609 ipoib_path_cache_idx = 0;
613 * Handle received IPoIB multicast membership record
615 * @v ipoib IPoIB device
616 * @v mc_member_record Multicast membership record
618 static void ipoib_recv_mc_member_record ( struct ipoib_device *ipoib,
619 struct ib_mad_mc_member_record *mc_member_record ) {
620 /* Record parameters */
621 ipoib->broadcast_joined =
622 ( mc_member_record->scope__join_state & 0x0f );
623 ipoib->data_qkey = ntohl ( mc_member_record->qkey );
624 ipoib->broadcast_lid = ntohs ( mc_member_record->mlid );
625 DBGC ( ipoib, "IPoIB %p %s broadcast group: qkey %lx mlid %x\n",
626 ipoib, ( ipoib->broadcast_joined ? "joined" : "left" ),
627 ipoib->data_qkey, ipoib->broadcast_lid );
631 * Handle IPoIB metadata receive completion
633 * @v ibdev Infiniband device
634 * @v qp Queue pair
635 * @v completion Completion
636 * @v iobuf I/O buffer
638 static void ipoib_meta_complete_recv ( struct ib_device *ibdev __unused,
639 struct ib_queue_pair *qp,
640 struct ib_completion *completion,
641 struct io_buffer *iobuf ) {
642 struct net_device *netdev = ib_qp_get_ownerdata ( qp );
643 struct ipoib_device *ipoib = netdev->priv;
644 union ib_mad *mad;
646 if ( completion->syndrome ) {
647 DBGC ( ipoib, "IPoIB %p metadata RX completion error %x\n",
648 ipoib, completion->syndrome );
649 goto done;
652 iob_put ( iobuf, completion->len );
653 if ( iob_len ( iobuf ) < sizeof ( struct ib_global_route_header ) ) {
654 DBGC ( ipoib, "IPoIB %p received metadata packet too short "
655 "to contain GRH\n", ipoib );
656 DBGC_HD ( ipoib, iobuf->data, iob_len ( iobuf ) );
657 goto done;
659 iob_pull ( iobuf, sizeof ( struct ib_global_route_header ) );
660 if ( iob_len ( iobuf ) < sizeof ( *mad ) ) {
661 DBGC ( ipoib, "IPoIB %p received metadata packet too short "
662 "to contain reply\n", ipoib );
663 DBGC_HD ( ipoib, iobuf->data, iob_len ( iobuf ) );
664 goto done;
666 mad = iobuf->data;
668 if ( mad->mad_hdr.status != 0 ) {
669 DBGC ( ipoib, "IPoIB %p metadata RX err status %04x\n",
670 ipoib, ntohs ( mad->mad_hdr.status ) );
671 goto done;
674 switch ( mad->mad_hdr.tid[0] ) {
675 case IPOIB_TID_GET_PATH_REC:
676 ipoib_recv_path_record ( ipoib, &mad->path_record );
677 break;
678 case IPOIB_TID_MC_MEMBER_REC:
679 ipoib_recv_mc_member_record ( ipoib, &mad->mc_member_record );
680 break;
681 default:
682 DBGC ( ipoib, "IPoIB %p unwanted response:\n",
683 ipoib );
684 DBGC_HD ( ipoib, mad, sizeof ( *mad ) );
685 break;
688 done:
689 ipoib->meta.recv_fill--;
690 free_iob ( iobuf );
694 * Refill IPoIB receive ring
696 * @v ipoib IPoIB device
698 static void ipoib_refill_recv ( struct ipoib_device *ipoib,
699 struct ipoib_queue_set *qset ) {
700 struct ib_device *ibdev = ipoib->ibdev;
701 struct io_buffer *iobuf;
702 int rc;
704 while ( qset->recv_fill < qset->recv_max_fill ) {
705 iobuf = alloc_iob ( IPOIB_MTU );
706 if ( ! iobuf )
707 break;
708 if ( ( rc = ib_post_recv ( ibdev, qset->qp, iobuf ) ) != 0 ) {
709 free_iob ( iobuf );
710 break;
712 qset->recv_fill++;
717 * Poll IPoIB network device
719 * @v netdev Network device
721 static void ipoib_poll ( struct net_device *netdev ) {
722 struct ipoib_device *ipoib = netdev->priv;
723 struct ib_device *ibdev = ipoib->ibdev;
725 ib_poll_cq ( ibdev, ipoib->meta.cq, ipoib_meta_complete_send,
726 ipoib_meta_complete_recv );
727 ib_poll_cq ( ibdev, ipoib->data.cq, ipoib_data_complete_send,
728 ipoib_data_complete_recv );
729 ipoib_refill_recv ( ipoib, &ipoib->meta );
730 ipoib_refill_recv ( ipoib, &ipoib->data );
734 * Enable/disable interrupts on IPoIB network device
736 * @v netdev Network device
737 * @v enable Interrupts should be enabled
739 static void ipoib_irq ( struct net_device *netdev __unused,
740 int enable __unused ) {
741 /* No implementation */
745 * Open IPoIB network device
747 * @v netdev Network device
748 * @ret rc Return status code
750 static int ipoib_open ( struct net_device *netdev ) {
751 struct ipoib_device *ipoib = netdev->priv;
752 struct ib_device *ibdev = ipoib->ibdev;
753 int rc;
755 /* Attach to broadcast multicast GID */
756 if ( ( rc = ib_mcast_attach ( ibdev, ipoib->data.qp,
757 &ipoib->broadcast_gid ) ) != 0 ) {
758 DBG ( "Could not attach to broadcast GID: %s\n",
759 strerror ( rc ) );
760 return rc;
763 /* Fill receive rings */
764 ipoib_refill_recv ( ipoib, &ipoib->meta );
765 ipoib_refill_recv ( ipoib, &ipoib->data );
767 return 0;
771 * Close IPoIB network device
773 * @v netdev Network device
775 static void ipoib_close ( struct net_device *netdev ) {
776 struct ipoib_device *ipoib = netdev->priv;
777 struct ib_device *ibdev = ipoib->ibdev;
779 /* Detach from broadcast multicast GID */
780 ib_mcast_detach ( ibdev, ipoib->data.qp, &ipoib->broadcast_gid );
782 /* FIXME: should probably flush the receive ring */
785 /** IPoIB network device operations */
786 static struct net_device_operations ipoib_operations = {
787 .open = ipoib_open,
788 .close = ipoib_close,
789 .transmit = ipoib_transmit,
790 .poll = ipoib_poll,
791 .irq = ipoib_irq,
795 * Join IPoIB broadcast group
797 * @v ipoib IPoIB device
798 * @ret rc Return status code
800 static int ipoib_join_broadcast_group ( struct ipoib_device *ipoib ) {
801 struct ib_device *ibdev = ipoib->ibdev;
802 unsigned int delay_ms;
803 int rc;
805 /* Make sure we have some receive descriptors */
806 ipoib_refill_recv ( ipoib, &ipoib->meta );
808 /* Send join request */
809 if ( ( rc = ipoib_mc_member_record ( ipoib, &ipoib->broadcast_gid,
810 1 ) ) != 0 ) {
811 DBGC ( ipoib, "IPoIB %p could not send broadcast join: %s\n",
812 ipoib, strerror ( rc ) );
813 return rc;
816 /* Wait for join to complete. Ideally we wouldn't delay for
817 * this long, but we need the queue key before we can set up
818 * the data queue pair, which we need before we can know the
819 * MAC address.
821 for ( delay_ms = IPOIB_JOIN_MAX_DELAY_MS ; delay_ms ; delay_ms-- ) {
822 mdelay ( 1 );
823 ib_poll_cq ( ibdev, ipoib->meta.cq, ipoib_meta_complete_send,
824 ipoib_meta_complete_recv );
825 ipoib_refill_recv ( ipoib, &ipoib->meta );
826 if ( ipoib->broadcast_joined )
827 return 0;
829 DBGC ( ipoib, "IPoIB %p timed out waiting for broadcast join\n",
830 ipoib );
832 return -ETIMEDOUT;
836 * Probe IPoIB device
838 * @v ibdev Infiniband device
839 * @ret rc Return status code
841 int ipoib_probe ( struct ib_device *ibdev ) {
842 struct net_device *netdev;
843 struct ipoib_device *ipoib;
844 struct ipoib_mac *mac;
845 int rc;
847 /* Allocate network device */
848 netdev = alloc_ipoibdev ( sizeof ( *ipoib ) );
849 if ( ! netdev )
850 return -ENOMEM;
851 netdev_init ( netdev, &ipoib_operations );
852 ipoib = netdev->priv;
853 ib_set_ownerdata ( ibdev, netdev );
854 netdev->dev = ibdev->dev;
855 memset ( ipoib, 0, sizeof ( *ipoib ) );
856 ipoib->netdev = netdev;
857 ipoib->ibdev = ibdev;
859 /* Calculate broadcast GID */
860 memcpy ( &ipoib->broadcast_gid, &ipv4_broadcast_gid,
861 sizeof ( ipoib->broadcast_gid ) );
862 ipoib->broadcast_gid.u.words[2] = htons ( ibdev->pkey );
864 /* Allocate metadata queue set */
865 if ( ( rc = ipoib_create_qset ( ipoib, &ipoib->meta,
866 IPOIB_META_NUM_CQES,
867 IPOIB_META_NUM_SEND_WQES,
868 IPOIB_META_NUM_RECV_WQES,
869 IB_GLOBAL_QKEY ) ) != 0 ) {
870 DBGC ( ipoib, "IPoIB %p could not allocate metadata QP: %s\n",
871 ipoib, strerror ( rc ) );
872 goto err_create_meta_qset;
875 /* Join broadcast group */
876 if ( ( rc = ipoib_join_broadcast_group ( ipoib ) ) != 0 ) {
877 DBGC ( ipoib, "IPoIB %p could not join broadcast group: %s\n",
878 ipoib, strerror ( rc ) );
879 goto err_join_broadcast_group;
882 /* Allocate data queue set */
883 if ( ( rc = ipoib_create_qset ( ipoib, &ipoib->data,
884 IPOIB_DATA_NUM_CQES,
885 IPOIB_DATA_NUM_SEND_WQES,
886 IPOIB_DATA_NUM_RECV_WQES,
887 ipoib->data_qkey ) ) != 0 ) {
888 DBGC ( ipoib, "IPoIB %p could not allocate data QP: %s\n",
889 ipoib, strerror ( rc ) );
890 goto err_create_data_qset;
893 /* Construct MAC address */
894 mac = ( ( struct ipoib_mac * ) netdev->ll_addr );
895 mac->qpn = htonl ( ipoib->data.qp->qpn );
896 memcpy ( &mac->gid, &ibdev->port_gid, sizeof ( mac->gid ) );
898 /* Register network device */
899 if ( ( rc = register_netdev ( netdev ) ) != 0 )
900 goto err_register_netdev;
902 return 0;
904 err_register_netdev:
905 ipoib_destroy_qset ( ipoib, &ipoib->data );
906 err_join_broadcast_group:
907 err_create_data_qset:
908 ipoib_destroy_qset ( ipoib, &ipoib->meta );
909 err_create_meta_qset:
910 netdev_nullify ( netdev );
911 netdev_put ( netdev );
912 return rc;
916 * Remove IPoIB device
918 * @v ibdev Infiniband device
920 void ipoib_remove ( struct ib_device *ibdev ) {
921 struct net_device *netdev = ib_get_ownerdata ( ibdev );
922 struct ipoib_device *ipoib = netdev->priv;
924 unregister_netdev ( netdev );
925 ipoib_destroy_qset ( ipoib, &ipoib->data );
926 ipoib_destroy_qset ( ipoib, &ipoib->meta );
927 netdev_nullify ( netdev );
928 netdev_put ( netdev );