dmake: do not set MAKEFLAGS=k
[unleashed/tickless.git] / usr / src / stand / lib / inet / ibd.c
blob3294ba46ffb855c7f07cc9c05991ba43065899f5
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License"). You may not use this file except in compliance
7 * with the License.
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
20 * CDDL HEADER END
23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
27 #pragma ident "%Z%%M% %I% %E% SMI"
29 #include <dhcp_impl.h>
30 #include <sys/types.h>
31 #include <socket_impl.h>
32 #include <socket_inet.h>
33 #include <sys/time.h>
34 #include <sys/socket.h>
35 #include <net/if.h>
36 #include <net/if_arp.h>
37 #include <netinet/in_systm.h>
38 #include <netinet/in.h>
39 #include <netinet/ip.h>
40 #include <netinet/if_ether.h>
41 #include <sys/promif.h>
42 #include <sys/prom_plat.h>
43 #include <sys/salib.h>
44 #include <sys/bootdebug.h>
45 #include <sys/ib/clients/ibd/ibd.h>
47 #include "ipv4.h"
48 #include "dhcpv4.h"
49 #include "ipv4_impl.h"
50 #include "mac.h"
51 #include "mac_impl.h"
52 #include "ibd_inet.h"
54 struct ibd_arp {
55 struct arphdr ea_hdr; /* fixed-size header */
56 ipoib_mac_t arp_sha; /* sender hardware address */
57 uchar_t arp_spa[4]; /* sender protocol address */
58 ipoib_mac_t arp_tha; /* target hardware address */
59 uchar_t arp_tpa[4]; /* target protocol address */
62 extern int errno;
63 ipoib_mac_t ibdbroadcastaddr;
/*
 * Assumptions about OBP behavior (refer FWARC 2002/702, 2003/251):
 * 1. prom_write() accepts the 20 byte destination address as the
 *    first component in the send buffer. The buffer pointer points
 *    to the start of this 20 byte address. The length parameter is
 *    the IPoIB datagram size with the 20 bytes of destination
 *    address.
 * 2. OBP will not provide max-frame-size, since OBP can only
 *    determine that by querying the IBA mcg, and thus the property
 *    has to be /chosen:ipib-frame-size. This will refer to the IPoIB
 *    link MTU as per section 4.0 of ietf i/d, ie, the 4 byte IPoIB
 *    header plus the IP payload mtu. Plus the 20 bytes of addressing
 *    information.
 * 3. OBP will not provide mac-address property for IPoIB since there
 *    are built in assumptions about 6 byte address with that. Instead,
 *    /chosen:ipib-address will provide the local address.
 * 4. prom_read() returns 20 byte 0'ed filler followed by 4 byte
 *    IPoIB header followed by IP payload. The return value is -2,
 *    -1, 0, or the length of the received IPoIB datagram along with
 *    the 20 bytes MBZ. The buffer pointer points to the start of
 *    the 20 MBZ bytes. The length parameter reflects the max data
 *    size that should be copied into the buffer including the 20
 *    MBZ bytes.
 * 5. OBP will not provide chosen-network-type, only
 *    network-interface-type = ipib. On an Infiniband device, this
 *    however does not guarantee that it is a network device.
 * 6. OBP will provide the DHCP client id in /chosen:client-id.
 * 7. /chosen:ipib-broadcast will provide the broadcast address.
 * 8. OBP will validate that RARP is not being used before
 *    allowing boot to proceed to inetboot.
 */
97 struct arp_packet {
98 ipoib_ptxhdr_t arp_eh;
99 struct ibd_arp arp_ea;
/*
 * Debug printf: prints only when RB_DEBUG is set in boothowto.
 * The expansion is a bare "if", so only use it as a complete statement
 * inside a braced block (never directly before an "else").
 */
#define	dprintf	if (boothowto & RB_DEBUG) printf
104 static char *
105 ibd_print(ipoib_mac_t *ea)
107 unsigned char *macaddr = (unsigned char *)ea;
108 static char pbuf[(3 * IPOIB_ADDRL) + 1];
109 int i;
110 char *ptr = pbuf;
112 ptr = pbuf + sprintf(pbuf, "%x", *macaddr++);
113 for (i = 0; i < (IPOIB_ADDRL - 1); i++)
114 ptr += sprintf(ptr, ":%x", *macaddr++);
115 return (pbuf);
120 * Common ARP code. Broadcast the packet and wait for the right response.
122 * If arp is called for, caller expects a hardware address in the
123 * source hardware address (sha) field of the "out" argument.
125 * IPoIB does not support RARP (see ibd_revarp()).
127 * Returns TRUE if transaction succeeded, FALSE otherwise.
129 * The timeout argument is the number of milliseconds to wait for a
130 * response. An infinite timeout can be specified as 0xffffffff.
132 static int
133 ibd_comarp(struct arp_packet *out, uint32_t timeout)
135 struct arp_packet *in = (struct arp_packet *)mac_state.mac_buf;
136 int count, time, feedback, len, delay = 2;
137 char *ind = "-\\|/";
138 struct in_addr tmp_ia;
139 uint32_t wait_time;
141 bcopy((caddr_t)&ibdbroadcastaddr, (caddr_t)&out->arp_eh.ipoib_dest,
142 IPOIB_ADDRL);
144 out->arp_ea.arp_hrd = htons(ARPHRD_IB);
145 out->arp_ea.arp_pro = htons(ETHERTYPE_IP);
146 out->arp_ea.arp_hln = IPOIB_ADDRL;
147 out->arp_ea.arp_pln = sizeof (struct in_addr);
148 bcopy(mac_state.mac_addr_buf, (caddr_t)&out->arp_ea.arp_sha,
149 IPOIB_ADDRL);
150 ipv4_getipaddr(&tmp_ia);
151 tmp_ia.s_addr = htonl(tmp_ia.s_addr);
152 bcopy((caddr_t)&tmp_ia, (caddr_t)out->arp_ea.arp_spa,
153 sizeof (struct in_addr));
154 feedback = 0;
156 wait_time = prom_gettime() + timeout;
157 for (count = 0; timeout == ~0U || prom_gettime() < wait_time; count++) {
158 if (count == IBD_WAITCNT) {
160 * Since IPoIB does not support RARP (see ibd_revarp),
161 * we know that out->arp_ea.arp_op == ARPOP_REQUEST.
163 bcopy((caddr_t)out->arp_ea.arp_tpa,
164 (caddr_t)&tmp_ia, sizeof (struct in_addr));
165 printf("\nRequesting MAC address for: %s\n",
166 inet_ntoa(tmp_ia));
169 (void) prom_write(mac_state.mac_dev, (caddr_t)out,
170 sizeof (*out), 0, NETWORK);
172 if (count >= IBD_WAITCNT)
173 printf("%c\b", ind[feedback++ % 4]); /* activity */
175 time = prom_gettime() + (delay * 1000); /* broadcast delay */
176 while (prom_gettime() <= time) {
177 len = prom_read(mac_state.mac_dev, mac_state.mac_buf,
178 mac_state.mac_mtu, 0, NETWORK);
179 if (len < sizeof (struct arp_packet))
180 continue;
181 if (in->arp_ea.arp_pro != ntohs(ETHERTYPE_IP))
182 continue;
184 * Since IPoIB does not support RARP (see ibd_revarp),
185 * we know that out->arp_ea.arp_op == ARPOP_REQUEST.
187 if (in->arp_eh.ipoib_rhdr.ipoib_type !=
188 ntohs(ETHERTYPE_ARP))
189 continue;
190 if (in->arp_ea.arp_op != ntohs(ARPOP_REPLY))
191 continue;
192 if (bcmp((caddr_t)in->arp_ea.arp_spa,
193 (caddr_t)out->arp_ea.arp_tpa,
194 sizeof (struct in_addr)) != 0)
195 continue;
196 if (boothowto & RB_VERBOSE) {
197 bcopy((caddr_t)in->arp_ea.arp_spa,
198 (caddr_t)&tmp_ia,
199 sizeof (struct in_addr));
200 printf("Found %s @ %s\n",
201 inet_ntoa(tmp_ia),
202 ibd_print(&in->arp_ea.arp_sha));
204 /* copy hardware addr into "out" for caller */
205 bcopy((caddr_t)&in->arp_ea.arp_sha,
206 (caddr_t)&out->arp_ea.arp_sha, IPOIB_ADDRL);
207 return (TRUE);
210 delay = delay * 2; /* Double the request delay */
211 if (delay > 64) /* maximum delay is 64 seconds */
212 delay = 64;
214 return (FALSE);
218 * ARP client side
219 * Broadcasts to determine MAC address given network order IP address.
220 * See RFC 826
222 * Returns TRUE if successful, FALSE otherwise.
224 static int
225 ibd_arp(struct in_addr *ip, void *hap, uint32_t timeout)
227 ipoib_mac_t *ep = (ipoib_mac_t *)hap;
228 struct arp_packet out;
229 int result;
231 if (!initialized)
232 prom_panic("IPoIB device is not initialized.");
234 bzero((char *)&out, sizeof (struct arp_packet));
236 out.arp_eh.ipoib_rhdr.ipoib_type = htons(ETHERTYPE_ARP);
237 out.arp_ea.arp_op = htons(ARPOP_REQUEST);
238 bcopy((caddr_t)&ibdbroadcastaddr, (caddr_t)&out.arp_ea.arp_tha,
239 IPOIB_ADDRL);
240 bcopy((caddr_t)ip, (caddr_t)out.arp_ea.arp_tpa,
241 sizeof (struct in_addr));
243 result = ibd_comarp(&out, timeout);
245 if (result && (ep != NULL)) {
246 bcopy((caddr_t)&out.arp_ea.arp_sha, (caddr_t)ep, IPOIB_ADDRL);
248 return (result);
/*
 * Reverse ARP client side
 * Determine our Internet address given our MAC address
 * See RFC 903
 *
 * RARP is not available over IPoIB, so reaching this routine is fatal.
 */
static void
ibd_revarp(void)
{
	prom_panic("IPoIB can not boot with RARP.");
}
262 /* ARGSUSED */
263 static int
264 ibd_header_len(struct inetgram *igm)
267 * We indicate to upper layers to leave enough space
268 * in output buffers for filling in the IPoIB header
269 * and the 20 byte destination address in ibd_output().
271 return (IPOIB_HDRSIZE + IPOIB_ADDRL);
275 * Handle a IP datagram addressed to our MAC address or to the link
276 * layer broadcast address. Also respond to ARP requests. Generates
277 * inetgrams as long as there's data and the mac level IP timeout timer
278 * hasn't expired. As soon as there is no data, we try for
279 * IBD_INPUT_ATTEMPTS for more, then exit the loop, even if there is time
280 * left, since we expect to have data waiting for us when we're called, we just
281 * don't know how much.
283 * We workaround slow proms (some proms have hard sleeps for as much as 3msec)
284 * even though there are is data waiting.
286 * Returns the total number of MEDIA_LVL frames placed on the socket.
287 * Caller is expected to free up the inetgram resources.
289 static int
290 ibd_input(int index)
292 struct inetgram *inp;
293 ipoib_ptxhdr_t *eh;
294 int frames = 0; /* successful frames */
295 int attempts = 0; /* failed attempts after success */
296 int16_t len = 0, data_len;
297 uint32_t timeout, reltime;
298 uint32_t pre_pr, post_pr; /* prom_read interval */
300 #ifdef DEBUG
301 int failures = 0; /* total failures */
302 int total_attempts = 0; /* total prom_read */
303 int no_data = 0; /* no data in prom */
304 int arps = 0; /* arp requests processed */
305 uint32_t tot_pr = 0; /* prom_read time */
306 uint32_t tot_pc = 0; /* inetgram creation time */
307 uint32_t pre_pc;
308 uint32_t now;
309 #endif /* DEBUG */
311 if (!initialized)
312 prom_panic("IPoIB device is not initialized.");
314 if ((reltime = sockets[index].in_timeout) == 0)
315 reltime = mac_state.mac_in_timeout;
316 timeout = prom_gettime() + reltime;
318 do {
319 if (frames > IBD_MAX_FRAMES) {
320 /* someone is trying a denial of service attack */
321 break;
325 * The following is being paranoid about possible bugs
326 * where prom_read() returns a nonzero length, even when
327 * it's not read a packet; it zeroes out the header to
328 * compensate. Paranoia from calvin prom (V2) days.
330 bzero(mac_state.mac_buf, sizeof (ipoib_ptxhdr_t));
333 * Prom_read() will return 0 or -2 if no data is present. A
334 * return value of -1 means an error has occurred. We adjust
335 * the timeout by calling the time spent in prom_read() "free".
336 * prom_read() returns the number of bytes actually read, but
337 * will only copy "len" bytes into our buffer. Adjust in
338 * case the MTU is wrong.
340 pre_pr = prom_gettime();
341 len = prom_read(mac_state.mac_dev, mac_state.mac_buf,
342 mac_state.mac_mtu, 0, NETWORK);
343 post_pr = prom_gettime();
344 timeout += (post_pr - pre_pr);
345 #ifdef DEBUG
346 tot_pr += (post_pr - pre_pr);
347 total_attempts++;
348 #endif /* DEBUG */
350 if (len > mac_state.mac_mtu) {
351 dprintf("ibd_input: adjusting MTU %d -> %d\n",
352 mac_state.mac_mtu, len);
353 bkmem_free(mac_state.mac_buf, mac_state.mac_mtu);
354 mac_state.mac_mtu = len;
355 mac_state.mac_buf = bkmem_alloc(mac_state.mac_mtu);
356 if (mac_state.mac_buf == NULL) {
357 prom_panic("ibd_input: Cannot reallocate "
358 "netbuf memory.");
360 len = 0; /* pretend there was no data */
363 if (len == -1) {
364 #ifdef DEBUG
365 failures++;
366 #endif /* DEBUG */
367 break;
369 if (len == 0 || len == -2) {
370 if (frames != 0)
371 attempts++;
372 #ifdef DEBUG
373 no_data++;
374 #endif /* DEBUG */
375 continue;
378 eh = (ipoib_ptxhdr_t *)mac_state.mac_buf;
379 if (eh->ipoib_rhdr.ipoib_type == ntohs(ETHERTYPE_IP) &&
380 len >= (sizeof (ipoib_ptxhdr_t) + sizeof (struct ip))) {
382 int offset;
383 #ifdef DEBUG
384 pre_pc = prom_gettime();
385 #endif /* DEBUG */
387 inp = (struct inetgram *)bkmem_zalloc(
388 sizeof (struct inetgram));
389 if (inp == NULL) {
390 errno = ENOMEM;
391 return (frames == 0 ? -1 : frames);
393 offset = sizeof (ipoib_ptxhdr_t);
394 data_len = len - offset;
395 inp->igm_mp = allocb(data_len, 0);
396 if (inp->igm_mp == NULL) {
397 errno = ENOMEM;
398 bkmem_free((caddr_t)inp,
399 sizeof (struct inetgram));
400 return (frames == 0 ? -1 : frames);
402 bcopy((caddr_t)(mac_state.mac_buf + offset),
403 inp->igm_mp->b_rptr, data_len);
404 inp->igm_mp->b_wptr += data_len;
405 inp->igm_level = NETWORK_LVL;
406 add_grams(&sockets[index].inq, inp);
407 frames++;
408 attempts = 0;
409 #ifdef DEBUG
410 tot_pc += prom_gettime() - pre_pc;
411 #endif /* DEBUG */
412 continue;
415 if (eh->ipoib_rhdr.ipoib_type == ntohs(ETHERTYPE_ARP) &&
416 len >= sizeof (struct arp_packet)) {
418 struct in_addr ip;
419 struct ibd_arp *ea;
421 #ifdef DEBUG
422 printf("ibd_input: ARP message received\n");
423 arps++;
424 #endif /* DEBUG */
426 ea = (struct ibd_arp *)(mac_state.mac_buf +
427 sizeof (ipoib_ptxhdr_t));
428 if (ea->arp_pro != ntohs(ETHERTYPE_IP))
429 continue;
431 ipv4_getipaddr(&ip);
432 ip.s_addr = ntohl(ip.s_addr);
434 if (ea->arp_op == ntohs(ARPOP_REQUEST) &&
435 ip.s_addr != INADDR_ANY &&
436 (bcmp((caddr_t)ea->arp_tpa, (caddr_t)&ip,
437 sizeof (struct in_addr)) == 0)) {
438 ea->arp_op = htons(ARPOP_REPLY);
439 bcopy((caddr_t)&ea->arp_sha,
440 (caddr_t)&eh->ipoib_dest, IPOIB_ADDRL);
441 bcopy((caddr_t)&ea->arp_sha,
442 (caddr_t)&ea->arp_tha, IPOIB_ADDRL);
443 bcopy((caddr_t)ea->arp_spa,
444 (caddr_t)ea->arp_tpa,
445 sizeof (struct in_addr));
446 bcopy(mac_state.mac_addr_buf,
447 (caddr_t)&ea->arp_sha,
448 mac_state.mac_addr_len);
449 bcopy((caddr_t)&ip, (caddr_t)ea->arp_spa,
450 sizeof (struct in_addr));
451 (void) prom_write(mac_state.mac_dev,
452 mac_state.mac_buf,
453 sizeof (struct arp_packet), 0, NETWORK);
454 /* don't charge for ARP replies */
455 timeout += reltime;
458 } while (attempts < IBD_INPUT_ATTEMPTS &&
459 #ifdef DEBUG
460 (now = prom_gettime()) < timeout);
461 #else
462 prom_gettime() < timeout);
463 #endif /* DEBUG */
465 #ifdef DEBUG
466 printf("ibd_input(%d): T/S/N/A/F/P/M: %d/%d/%d/%d/%d/%d/%d "
467 "T/O: %d < %d = %s\n", index, total_attempts, frames, no_data,
468 arps, failures, tot_pr, tot_pc, now, timeout,
469 (now < timeout) ? "TRUE" : "FALSE");
470 #endif /* DEBUG */
471 return (frames);
475 * Send out an IPoIB datagram. We expect a IP frame appropriately fragmented
476 * at this level.
478 * Errno is set and -1 is returned if an error occurs. Number of bytes sent
479 * is returned on success.
481 /* ARGSUSED */
482 static int
483 ibd_output(int index, struct inetgram *ogp)
485 int header_len, result;
486 ipoib_ptxhdr_t eh;
487 struct ip *ip;
488 struct in_addr tmpip, ipdst;
489 int broadcast = FALSE;
490 int size;
491 mblk_t *mp;
493 if (!initialized)
494 prom_panic("IPoIB device is not initialized.");
496 if (ogp->igm_level != MEDIA_LVL) {
497 dprintf("ibd_output: frame type wrong: socket: %d\n",
498 index * SOCKETTYPE);
499 errno = EINVAL;
500 return (-1);
503 header_len = IPOIB_HDRSIZE + IPOIB_ADDRL;
504 mp = ogp->igm_mp;
505 size = mp->b_wptr - mp->b_rptr;
506 if (size > (mac_state.mac_mtu - IPOIB_ADDRL)) {
507 dprintf("ibd_output: frame size too big: %d\n", size);
508 errno = E2BIG;
509 return (-1);
512 size += header_len;
513 ip = (struct ip *)(mp->b_rptr);
515 eh.ipoib_rhdr.ipoib_type = htons(ETHERTYPE_IP);
516 eh.ipoib_rhdr.ipoib_mbz = 0;
517 bcopy((caddr_t)&ip->ip_dst, (caddr_t)&ipdst, sizeof (ipdst));
519 if (ipdst.s_addr == htonl(INADDR_BROADCAST))
520 broadcast = TRUE; /* limited broadcast */
522 if (!broadcast) {
523 struct in_addr mask;
525 ipv4_getnetmask(&mask);
526 mask.s_addr = htonl(mask.s_addr);
527 if (mask.s_addr != htonl(INADDR_BROADCAST) &&
528 (ipdst.s_addr & ~mask.s_addr) == 0) {
529 broadcast = TRUE; /* directed broadcast */
530 } else {
531 if (ogp->igm_router.s_addr != htonl(INADDR_ANY))
532 tmpip.s_addr = ogp->igm_router.s_addr;
533 else
534 tmpip.s_addr = ipdst.s_addr;
536 result = mac_get_arp(&tmpip, (void *)&eh.ipoib_dest,
537 IPOIB_ADDRL, mac_state.mac_arp_timeout);
538 if (!result) {
539 errno = ETIMEDOUT;
540 dprintf("ibd_output: ARP request for %s "
541 "timed out.\n", inet_ntoa(tmpip));
542 return (-1);
547 if (broadcast)
548 bcopy((caddr_t)&ibdbroadcastaddr, (caddr_t)&eh.ipoib_dest,
549 IPOIB_ADDRL);
551 /* add the ibd header */
552 mp->b_rptr -= sizeof (eh);
553 bcopy((caddr_t)&eh, mp->b_rptr, sizeof (eh));
555 #ifdef DEBUG
556 printf("ibd_output(%d): level(%d) frame(0x%x) len(%d)\n",
557 index, ogp->igm_level, mp->b_rptr, size);
558 #endif /* DEBUG */
560 return (prom_write(mac_state.mac_dev, (char *)mp->b_rptr, size,
561 0, NETWORK));
564 void
565 ibd_init(void)
567 pnode_t chosen;
568 char *mtuprop = "ipib-frame-size";
569 char *bcastprop = "ipib-broadcast";
570 char *addrprop = "ipib-address";
571 char *cidprop = "client-id";
572 int cidlen;
573 uint8_t dhcpcid[DHCP_MAX_CID_LEN];
575 mac_state.mac_addr_len = IPOIB_ADDRL;
576 mac_state.mac_addr_buf = bkmem_alloc(mac_state.mac_addr_len);
577 if (mac_state.mac_addr_buf == NULL)
578 prom_panic("ibd_init: Cannot allocate memory.");
580 chosen = prom_finddevice("/chosen");
581 if (chosen == OBP_NONODE || chosen == OBP_BADNODE)
582 prom_panic("ibd_init: Cannot find /chosen.");
584 if (prom_getprop(chosen, addrprop, (caddr_t)mac_state.mac_addr_buf) !=
585 IPOIB_ADDRL)
586 prom_panic("ibd_init: Cannot find /chosen:ipib-address\n.");
588 if (prom_getprop(chosen, bcastprop, (caddr_t)&ibdbroadcastaddr) !=
589 IPOIB_ADDRL)
590 prom_panic("ibd_init: Cannot find /chosen:ipib-broadcast\n.");
592 if (((cidlen = prom_getproplen(chosen, cidprop)) <= 0) ||
593 (cidlen > DHCP_MAX_CID_LEN) || (prom_getprop(chosen, cidprop,
594 (caddr_t)&dhcpcid) != cidlen))
595 prom_panic("ibd_init: Invalid /chosen:client-id\n.");
596 dhcp_set_client_id(dhcpcid, cidlen);
599 * Note that prom reports mtu including 20 bytes of
600 * addressing information.
602 if (prom_getprop(chosen, mtuprop,
603 (caddr_t)&mac_state.mac_mtu) <= 0)
604 mac_state.mac_mtu = IBDSIZE + IPOIB_ADDRL;
607 * Tell upper layers that we can support a little
608 * more. We will be taking off these 20 bytes at
609 * the start before we invoke prom_write() to send
610 * over the wire.
612 mac_state.mac_arp_timeout = IBD_ARP_TIMEOUT;
613 mac_state.mac_in_timeout = IBD_IN_TIMEOUT;
615 mac_state.mac_arp = ibd_arp;
616 mac_state.mac_rarp = ibd_revarp;
617 mac_state.mac_header_len = ibd_header_len;
618 mac_state.mac_input = ibd_input;
619 mac_state.mac_output = ibd_output;