4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License"). You may not use this file except in compliance
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
27 #pragma ident "%Z%%M% %I% %E% SMI"
29 #include <dhcp_impl.h>
30 #include <sys/types.h>
31 #include <socket_impl.h>
32 #include <socket_inet.h>
34 #include <sys/socket.h>
36 #include <net/if_arp.h>
37 #include <netinet/in_systm.h>
38 #include <netinet/in.h>
39 #include <netinet/ip.h>
40 #include <netinet/if_ether.h>
41 #include <sys/promif.h>
42 #include <sys/prom_plat.h>
43 #include <sys/salib.h>
44 #include <sys/bootdebug.h>
45 #include <sys/ib/clients/ibd/ibd.h>
49 #include "ipv4_impl.h"
55 struct arphdr ea_hdr
; /* fixed-size header */
56 ipoib_mac_t arp_sha
; /* sender hardware address */
57 uchar_t arp_spa
[4]; /* sender protocol address */
58 ipoib_mac_t arp_tha
; /* target hardware address */
59 uchar_t arp_tpa
[4]; /* target protocol address */
63 ipoib_mac_t ibdbroadcastaddr
;
66 * Assumptions about OBP behavior (refer to FWARC 2002/702, 2003/251):
67 * 1. prom_write() accepts the 20 byte destination address as the
68 * first component in the send buffer. The buffer pointer points
69 * to the start of this 20 byte address. The length parameter is
70 * the IPoIB datagram size with the 20 byte of destination
72 * 2. OBP will not provide max-frame-size, since OBP can only
73 * determine that by querying the IBA mcg, and thus the property
74 * has to be /chosen:ipib-frame-size. This will refer to the IPoIB
75 * link MTU as per section 4.0 of ietf i/d, ie, the 4 byte IPoIB
76 * header plus the IP payload mtu. Plus the 20 bytes of addressing
78 * 3. OBP will not provide mac-address property for IPoIB since there
79 * are built-in assumptions about a 6 byte address with that. Instead,
80 * /chosen:ipib-address will provide the local address.
81 * 4. prom_read() returns 20 byte 0'ed filler followed by 4 byte
82 * IPoIB header followed by IP payload. The return value is -2,
83 * -1, 0, or the length of the received IPoIB datagram along with
84 * the 20 bytes MBZ. The buffer pointer points to the start of
85 * the 20 MBZ bytes. The length parameter reflects the max data
86 * size that should be copied into the buffer including the 20
88 * 5. OBP will not provide chosen-network-type, only
89 * network-interface-type = ipib. On an Infiniband device, this
90 * however does not guarantee that it is a network device.
91 * 6. OBP will provide the DHCP client id in /chosen:client-id.
92 * 7. /chosen:ipib-broadcast will provide the broadcast address.
93 * 8. OBP will validate that RARP is not being used before
94 * allowing boot to proceed to inetboot.
98 ipoib_ptxhdr_t arp_eh
;
99 struct ibd_arp arp_ea
;
102 #define dprintf if (boothowto & RB_DEBUG) printf
105 ibd_print(ipoib_mac_t
*ea
)
107 unsigned char *macaddr
= (unsigned char *)ea
;
108 static char pbuf
[(3 * IPOIB_ADDRL
) + 1];
112 ptr
= pbuf
+ sprintf(pbuf
, "%x", *macaddr
++);
113 for (i
= 0; i
< (IPOIB_ADDRL
- 1); i
++)
114 ptr
+= sprintf(ptr
, ":%x", *macaddr
++);
120 * Common ARP code. Broadcast the packet and wait for the right response.
122 * If arp is called for, caller expects a hardware address in the
123 * source hardware address (sha) field of the "out" argument.
125 * IPoIB does not support RARP (see ibd_revarp()).
127 * Returns TRUE if transaction succeeded, FALSE otherwise.
129 * The timeout argument is the number of milliseconds to wait for a
130 * response. An infinite timeout can be specified as 0xffffffff.
133 ibd_comarp(struct arp_packet
*out
, uint32_t timeout
)
135 struct arp_packet
*in
= (struct arp_packet
*)mac_state
.mac_buf
;
136 int count
, time
, feedback
, len
, delay
= 2;
138 struct in_addr tmp_ia
;
141 bcopy((caddr_t
)&ibdbroadcastaddr
, (caddr_t
)&out
->arp_eh
.ipoib_dest
,
144 out
->arp_ea
.arp_hrd
= htons(ARPHRD_IB
);
145 out
->arp_ea
.arp_pro
= htons(ETHERTYPE_IP
);
146 out
->arp_ea
.arp_hln
= IPOIB_ADDRL
;
147 out
->arp_ea
.arp_pln
= sizeof (struct in_addr
);
148 bcopy(mac_state
.mac_addr_buf
, (caddr_t
)&out
->arp_ea
.arp_sha
,
150 ipv4_getipaddr(&tmp_ia
);
151 tmp_ia
.s_addr
= htonl(tmp_ia
.s_addr
);
152 bcopy((caddr_t
)&tmp_ia
, (caddr_t
)out
->arp_ea
.arp_spa
,
153 sizeof (struct in_addr
));
156 wait_time
= prom_gettime() + timeout
;
157 for (count
= 0; timeout
== ~0U || prom_gettime() < wait_time
; count
++) {
158 if (count
== IBD_WAITCNT
) {
160 * Since IPoIB does not support RARP (see ibd_revarp),
161 * we know that out->arp_ea.arp_op == ARPOP_REQUEST.
163 bcopy((caddr_t
)out
->arp_ea
.arp_tpa
,
164 (caddr_t
)&tmp_ia
, sizeof (struct in_addr
));
165 printf("\nRequesting MAC address for: %s\n",
169 (void) prom_write(mac_state
.mac_dev
, (caddr_t
)out
,
170 sizeof (*out
), 0, NETWORK
);
172 if (count
>= IBD_WAITCNT
)
173 printf("%c\b", ind
[feedback
++ % 4]); /* activity */
175 time
= prom_gettime() + (delay
* 1000); /* broadcast delay */
176 while (prom_gettime() <= time
) {
177 len
= prom_read(mac_state
.mac_dev
, mac_state
.mac_buf
,
178 mac_state
.mac_mtu
, 0, NETWORK
);
179 if (len
< sizeof (struct arp_packet
))
181 if (in
->arp_ea
.arp_pro
!= ntohs(ETHERTYPE_IP
))
184 * Since IPoIB does not support RARP (see ibd_revarp),
185 * we know that out->arp_ea.arp_op == ARPOP_REQUEST.
187 if (in
->arp_eh
.ipoib_rhdr
.ipoib_type
!=
188 ntohs(ETHERTYPE_ARP
))
190 if (in
->arp_ea
.arp_op
!= ntohs(ARPOP_REPLY
))
192 if (bcmp((caddr_t
)in
->arp_ea
.arp_spa
,
193 (caddr_t
)out
->arp_ea
.arp_tpa
,
194 sizeof (struct in_addr
)) != 0)
196 if (boothowto
& RB_VERBOSE
) {
197 bcopy((caddr_t
)in
->arp_ea
.arp_spa
,
199 sizeof (struct in_addr
));
200 printf("Found %s @ %s\n",
202 ibd_print(&in
->arp_ea
.arp_sha
));
204 /* copy hardware addr into "out" for caller */
205 bcopy((caddr_t
)&in
->arp_ea
.arp_sha
,
206 (caddr_t
)&out
->arp_ea
.arp_sha
, IPOIB_ADDRL
);
210 delay
= delay
* 2; /* Double the request delay */
211 if (delay
> 64) /* maximum delay is 64 seconds */
219 * Broadcasts to determine MAC address given network order IP address.
222 * Returns TRUE if successful, FALSE otherwise.
225 ibd_arp(struct in_addr
*ip
, void *hap
, uint32_t timeout
)
227 ipoib_mac_t
*ep
= (ipoib_mac_t
*)hap
;
228 struct arp_packet out
;
232 prom_panic("IPoIB device is not initialized.");
234 bzero((char *)&out
, sizeof (struct arp_packet
));
236 out
.arp_eh
.ipoib_rhdr
.ipoib_type
= htons(ETHERTYPE_ARP
);
237 out
.arp_ea
.arp_op
= htons(ARPOP_REQUEST
);
238 bcopy((caddr_t
)&ibdbroadcastaddr
, (caddr_t
)&out
.arp_ea
.arp_tha
,
240 bcopy((caddr_t
)ip
, (caddr_t
)out
.arp_ea
.arp_tpa
,
241 sizeof (struct in_addr
));
243 result
= ibd_comarp(&out
, timeout
);
245 if (result
&& (ep
!= NULL
)) {
246 bcopy((caddr_t
)&out
.arp_ea
.arp_sha
, (caddr_t
)ep
, IPOIB_ADDRL
);
252 * Reverse ARP client side
253 * Determine our Internet address given our MAC address
259 prom_panic("IPoIB can not boot with RARP.");
264 ibd_header_len(struct inetgram
*igm
)
267 * We indicate to upper layers to leave enough space
268 * in output buffers for filling in the IPoIB header
269 * and the 20 byte destination address in ibd_output().
271 return (IPOIB_HDRSIZE
+ IPOIB_ADDRL
);
275 * Handle an IP datagram addressed to our MAC address or to the link
276 * layer broadcast address. Also respond to ARP requests. Generates
277 * inetgrams as long as there's data and the mac level IP timeout timer
278 * hasn't expired. As soon as there is no data, we try for
279 * IBD_INPUT_ATTEMPTS for more, then exit the loop, even if there is time
280 * left, since we expect to have data waiting for us when we're called, we just
281 * don't know how much.
283 * We work around slow proms (some proms have hard sleeps for as much as 3msec)
284 * even though there is data waiting.
286 * Returns the total number of MEDIA_LVL frames placed on the socket.
287 * Caller is expected to free up the inetgram resources.
292 struct inetgram
*inp
;
294 int frames
= 0; /* successful frames */
295 int attempts
= 0; /* failed attempts after success */
296 int16_t len
= 0, data_len
;
297 uint32_t timeout
, reltime
;
298 uint32_t pre_pr
, post_pr
; /* prom_read interval */
301 int failures
= 0; /* total failures */
302 int total_attempts
= 0; /* total prom_read */
303 int no_data
= 0; /* no data in prom */
304 int arps
= 0; /* arp requests processed */
305 uint32_t tot_pr
= 0; /* prom_read time */
306 uint32_t tot_pc
= 0; /* inetgram creation time */
312 prom_panic("IPoIB device is not initialized.");
314 if ((reltime
= sockets
[index
].in_timeout
) == 0)
315 reltime
= mac_state
.mac_in_timeout
;
316 timeout
= prom_gettime() + reltime
;
319 if (frames
> IBD_MAX_FRAMES
) {
320 /* someone is trying a denial of service attack */
325 * The following is being paranoid about possible bugs
326 * where prom_read() returns a nonzero length, even when
327 * it's not read a packet; it zeroes out the header to
328 * compensate. Paranoia from calvin prom (V2) days.
330 bzero(mac_state
.mac_buf
, sizeof (ipoib_ptxhdr_t
));
333 * prom_read() will return 0 or -2 if no data is present. A
334 * return value of -1 means an error has occurred. We adjust
335 * the timeout by calling the time spent in prom_read() "free".
336 * prom_read() returns the number of bytes actually read, but
337 * will only copy "len" bytes into our buffer. Adjust in
338 * case the MTU is wrong.
340 pre_pr
= prom_gettime();
341 len
= prom_read(mac_state
.mac_dev
, mac_state
.mac_buf
,
342 mac_state
.mac_mtu
, 0, NETWORK
);
343 post_pr
= prom_gettime();
344 timeout
+= (post_pr
- pre_pr
);
346 tot_pr
+= (post_pr
- pre_pr
);
350 if (len
> mac_state
.mac_mtu
) {
351 dprintf("ibd_input: adjusting MTU %d -> %d\n",
352 mac_state
.mac_mtu
, len
);
353 bkmem_free(mac_state
.mac_buf
, mac_state
.mac_mtu
);
354 mac_state
.mac_mtu
= len
;
355 mac_state
.mac_buf
= bkmem_alloc(mac_state
.mac_mtu
);
356 if (mac_state
.mac_buf
== NULL
) {
357 prom_panic("ibd_input: Cannot reallocate "
360 len
= 0; /* pretend there was no data */
369 if (len
== 0 || len
== -2) {
378 eh
= (ipoib_ptxhdr_t
*)mac_state
.mac_buf
;
379 if (eh
->ipoib_rhdr
.ipoib_type
== ntohs(ETHERTYPE_IP
) &&
380 len
>= (sizeof (ipoib_ptxhdr_t
) + sizeof (struct ip
))) {
384 pre_pc
= prom_gettime();
387 inp
= (struct inetgram
*)bkmem_zalloc(
388 sizeof (struct inetgram
));
391 return (frames
== 0 ? -1 : frames
);
393 offset
= sizeof (ipoib_ptxhdr_t
);
394 data_len
= len
- offset
;
395 inp
->igm_mp
= allocb(data_len
, 0);
396 if (inp
->igm_mp
== NULL
) {
398 bkmem_free((caddr_t
)inp
,
399 sizeof (struct inetgram
));
400 return (frames
== 0 ? -1 : frames
);
402 bcopy((caddr_t
)(mac_state
.mac_buf
+ offset
),
403 inp
->igm_mp
->b_rptr
, data_len
);
404 inp
->igm_mp
->b_wptr
+= data_len
;
405 inp
->igm_level
= NETWORK_LVL
;
406 add_grams(&sockets
[index
].inq
, inp
);
410 tot_pc
+= prom_gettime() - pre_pc
;
415 if (eh
->ipoib_rhdr
.ipoib_type
== ntohs(ETHERTYPE_ARP
) &&
416 len
>= sizeof (struct arp_packet
)) {
422 printf("ibd_input: ARP message received\n");
426 ea
= (struct ibd_arp
*)(mac_state
.mac_buf
+
427 sizeof (ipoib_ptxhdr_t
));
428 if (ea
->arp_pro
!= ntohs(ETHERTYPE_IP
))
432 ip
.s_addr
= ntohl(ip
.s_addr
);
434 if (ea
->arp_op
== ntohs(ARPOP_REQUEST
) &&
435 ip
.s_addr
!= INADDR_ANY
&&
436 (bcmp((caddr_t
)ea
->arp_tpa
, (caddr_t
)&ip
,
437 sizeof (struct in_addr
)) == 0)) {
438 ea
->arp_op
= htons(ARPOP_REPLY
);
439 bcopy((caddr_t
)&ea
->arp_sha
,
440 (caddr_t
)&eh
->ipoib_dest
, IPOIB_ADDRL
);
441 bcopy((caddr_t
)&ea
->arp_sha
,
442 (caddr_t
)&ea
->arp_tha
, IPOIB_ADDRL
);
443 bcopy((caddr_t
)ea
->arp_spa
,
444 (caddr_t
)ea
->arp_tpa
,
445 sizeof (struct in_addr
));
446 bcopy(mac_state
.mac_addr_buf
,
447 (caddr_t
)&ea
->arp_sha
,
448 mac_state
.mac_addr_len
);
449 bcopy((caddr_t
)&ip
, (caddr_t
)ea
->arp_spa
,
450 sizeof (struct in_addr
));
451 (void) prom_write(mac_state
.mac_dev
,
453 sizeof (struct arp_packet
), 0, NETWORK
);
454 /* don't charge for ARP replies */
458 } while (attempts
< IBD_INPUT_ATTEMPTS
&&
460 (now
= prom_gettime()) < timeout
);
462 prom_gettime() < timeout
);
466 printf("ibd_input(%d): T/S/N/A/F/P/M: %d/%d/%d/%d/%d/%d/%d "
467 "T/O: %d < %d = %s\n", index
, total_attempts
, frames
, no_data
,
468 arps
, failures
, tot_pr
, tot_pc
, now
, timeout
,
469 (now
< timeout
) ? "TRUE" : "FALSE");
475 * Send out an IPoIB datagram. We expect an IP frame appropriately fragmented
478 * Errno is set and -1 is returned if an error occurs. Number of bytes sent
479 * is returned on success.
483 ibd_output(int index
, struct inetgram
*ogp
)
485 int header_len
, result
;
488 struct in_addr tmpip
, ipdst
;
489 int broadcast
= FALSE
;
494 prom_panic("IPoIB device is not initialized.");
496 if (ogp
->igm_level
!= MEDIA_LVL
) {
497 dprintf("ibd_output: frame type wrong: socket: %d\n",
503 header_len
= IPOIB_HDRSIZE
+ IPOIB_ADDRL
;
505 size
= mp
->b_wptr
- mp
->b_rptr
;
506 if (size
> (mac_state
.mac_mtu
- IPOIB_ADDRL
)) {
507 dprintf("ibd_output: frame size too big: %d\n", size
);
513 ip
= (struct ip
*)(mp
->b_rptr
);
515 eh
.ipoib_rhdr
.ipoib_type
= htons(ETHERTYPE_IP
);
516 eh
.ipoib_rhdr
.ipoib_mbz
= 0;
517 bcopy((caddr_t
)&ip
->ip_dst
, (caddr_t
)&ipdst
, sizeof (ipdst
));
519 if (ipdst
.s_addr
== htonl(INADDR_BROADCAST
))
520 broadcast
= TRUE
; /* limited broadcast */
525 ipv4_getnetmask(&mask
);
526 mask
.s_addr
= htonl(mask
.s_addr
);
527 if (mask
.s_addr
!= htonl(INADDR_BROADCAST
) &&
528 (ipdst
.s_addr
& ~mask
.s_addr
) == 0) {
529 broadcast
= TRUE
; /* directed broadcast */
531 if (ogp
->igm_router
.s_addr
!= htonl(INADDR_ANY
))
532 tmpip
.s_addr
= ogp
->igm_router
.s_addr
;
534 tmpip
.s_addr
= ipdst
.s_addr
;
536 result
= mac_get_arp(&tmpip
, (void *)&eh
.ipoib_dest
,
537 IPOIB_ADDRL
, mac_state
.mac_arp_timeout
);
540 dprintf("ibd_output: ARP request for %s "
541 "timed out.\n", inet_ntoa(tmpip
));
548 bcopy((caddr_t
)&ibdbroadcastaddr
, (caddr_t
)&eh
.ipoib_dest
,
551 /* add the ibd header */
552 mp
->b_rptr
-= sizeof (eh
);
553 bcopy((caddr_t
)&eh
, mp
->b_rptr
, sizeof (eh
));
556 printf("ibd_output(%d): level(%d) frame(0x%x) len(%d)\n",
557 index
, ogp
->igm_level
, mp
->b_rptr
, size
);
560 return (prom_write(mac_state
.mac_dev
, (char *)mp
->b_rptr
, size
,
568 char *mtuprop
= "ipib-frame-size";
569 char *bcastprop
= "ipib-broadcast";
570 char *addrprop
= "ipib-address";
571 char *cidprop
= "client-id";
573 uint8_t dhcpcid
[DHCP_MAX_CID_LEN
];
575 mac_state
.mac_addr_len
= IPOIB_ADDRL
;
576 mac_state
.mac_addr_buf
= bkmem_alloc(mac_state
.mac_addr_len
);
577 if (mac_state
.mac_addr_buf
== NULL
)
578 prom_panic("ibd_init: Cannot allocate memory.");
580 chosen
= prom_finddevice("/chosen");
581 if (chosen
== OBP_NONODE
|| chosen
== OBP_BADNODE
)
582 prom_panic("ibd_init: Cannot find /chosen.");
584 if (prom_getprop(chosen
, addrprop
, (caddr_t
)mac_state
.mac_addr_buf
) !=
586 prom_panic("ibd_init: Cannot find /chosen:ipib-address\n.");
588 if (prom_getprop(chosen
, bcastprop
, (caddr_t
)&ibdbroadcastaddr
) !=
590 prom_panic("ibd_init: Cannot find /chosen:ipib-broadcast\n.");
592 if (((cidlen
= prom_getproplen(chosen
, cidprop
)) <= 0) ||
593 (cidlen
> DHCP_MAX_CID_LEN
) || (prom_getprop(chosen
, cidprop
,
594 (caddr_t
)&dhcpcid
) != cidlen
))
595 prom_panic("ibd_init: Invalid /chosen:client-id\n.");
596 dhcp_set_client_id(dhcpcid
, cidlen
);
599 * Note that prom reports mtu including 20 bytes of
600 * addressing information.
602 if (prom_getprop(chosen
, mtuprop
,
603 (caddr_t
)&mac_state
.mac_mtu
) <= 0)
604 mac_state
.mac_mtu
= IBDSIZE
+ IPOIB_ADDRL
;
607 * Tell upper layers that we can support a little
608 * more. We will be taking off these 20 bytes at
609 * the start before we invoke prom_write() to send
612 mac_state
.mac_arp_timeout
= IBD_ARP_TIMEOUT
;
613 mac_state
.mac_in_timeout
= IBD_IN_TIMEOUT
;
615 mac_state
.mac_arp
= ibd_arp
;
616 mac_state
.mac_rarp
= ibd_revarp
;
617 mac_state
.mac_header_len
= ibd_header_len
;
618 mac_state
.mac_input
= ibd_input
;
619 mac_state
.mac_output
= ibd_output
;