Sync usage with man page.
[netbsd-mini2440.git] / sys / dev / pci / cxgb_l2t.c
blobdeed6c4f57a1e24e43164eb379f2dfea175ada25
1 /**************************************************************************
3 Copyright (c) 2007, Chelsio Inc.
4 All rights reserved.
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
9 1. Redistributions of source code must retain the above copyright notice,
10 this list of conditions and the following disclaimer.
12 2. Neither the name of the Chelsio Corporation nor the names of its
13 contributors may be used to endorse or promote products derived from
14 this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
28 ***************************************************************************/
30 #include <sys/cdefs.h>
31 #ifdef __NetBSD__
32 __KERNEL_RCSID(0, "$NetBSD: cxgb_l2t.c,v 1.6 2008/01/04 21:18:01 ad Exp $");
33 #endif
34 #ifdef __FreeBSD__
35 __FBSDID("$FreeBSD: src/sys/dev/cxgb/cxgb_l2t.c,v 1.3 2007/08/17 05:57:03 kmacy Exp $");
36 #endif
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/kernel.h>
41 #ifdef __FreeBSD__
42 #include <sys/module.h>
43 #include <sys/bus.h>
44 #endif
45 #include <sys/lock.h>
46 #include <sys/mutex.h>
47 #if __FreeBSD_version > 700000
48 #include <sys/rwlock.h>
49 #endif
51 #include <sys/socket.h>
52 #include <sys/socketvar.h>
53 #include <net/if.h>
54 #ifdef __FreeBSD__
55 #include <net/ethernet.h>
56 #include <net/if_vlan_var.h>
57 #endif
58 #ifdef __NetBSD__
59 #include <netinet/in.h>
60 #include <netinet/in_var.h>
61 #include <netinet/if_inarp.h>
62 #endif
63 #include <net/if_dl.h>
64 #include <net/route.h>
65 #include <netinet/in.h>
66 #ifdef __FreeBSD__
67 #include <netinet/if_ether.h>
68 #endif
70 #ifdef CONFIG_DEFINED
71 #include <cxgb_include.h>
72 #else
73 #ifdef __FreeBSD__
74 #include <dev/cxgb/cxgb_include.h>
75 #endif
76 #ifdef __NetBSD__
77 #include "cxgb_include.h"
78 #endif
79 #endif
/* VLAN value stored in an L2T entry when no VLAN tag applies. */
#define VLAN_NONE 0xfff
/* View a sockaddr pointer as a link-level sockaddr. */
#define SDL(s) ((struct sockaddr_dl *)(s))
/*
 * Link-level address bytes of a route entry.  Every caller in this file
 * passes a struct rtentry *, so the sockaddr_dl has to be taken from the
 * route's gateway; casting the rtentry itself would read unrelated memory.
 * NOTE(review): assumes rt_gateway is an AF_LINK sockaddr for ARP routes.
 */
#define RT_ENADDR(rt) ((u_char *)LLADDR(SDL((rt)->rt_gateway)))
/* Shorthand for the expiry time kept in the route metrics. */
#define rt_expire rt_rmx.rmx_expire
#ifdef __FreeBSD__
/*
 * ARP link-layer info record.  This file casts a route's rt_llinfo to this
 * type and reads la_hold and la_asked from it.
 * NOTE(review): presumably must match the kernel's private definition
 * field for field -- confirm against the ARP implementation in use.
 */
struct llinfo_arp {
	struct callout	 la_timer;
	struct rtentry	*la_rt;
	struct mbuf	*la_hold;	/* last packet until resolved/timeout */
	u_short		 la_preempt;	/* countdown for pre-expiry arps */
	u_short		 la_asked;	/* # requests sent */
};
#endif
/*
 * Module locking notes:  There is a RW lock protecting the L2 table as a
 * whole plus a lock per L2T entry.  Entry lookups and allocations happen
 * under the protection of the table lock, individual entry changes happen
 * while holding that entry's lock.  The table lock nests outside the
 * entry locks.  Allocations of new entries take the table lock as writers so
 * no other lookups can happen while allocating new entries.  Entry updates
 * take the table lock as readers so multiple entries can be updated in
 * parallel.  An L2T entry can be dropped by decrementing its reference count
 * and therefore can happen in parallel with entry allocation but no entry
 * can change state or increment its ref count during allocation as both of
 * these perform lookups.
 */
110 static inline unsigned int
111 vlan_prio(const struct l2t_entry *e)
113 return e->vlan >> 13;
116 static inline unsigned int
117 arp_hash(u32 key, int ifindex, const struct l2t_data *d)
119 return jhash_2words(key, ifindex, 0) & (d->nentries - 1);
122 static inline void
123 neigh_replace(struct l2t_entry *e, struct rtentry *rt)
125 RT_LOCK(rt);
126 RT_ADDREF(rt);
127 RT_UNLOCK(rt);
129 if (e->neigh) {
130 RT_LOCK(e->neigh);
131 RT_REMREF(e->neigh);
132 RT_UNLOCK(e->neigh);
134 e->neigh = rt;
138 * Set up an L2T entry and send any packets waiting in the arp queue. The
139 * supplied mbuf is used for the CPL_L2T_WRITE_REQ. Must be called with the
140 * entry locked.
142 static int
143 setup_l2e_send_pending(struct toedev *dev, struct mbuf *m,
144 struct l2t_entry *e)
146 struct cpl_l2t_write_req *req;
148 if (!m) {
149 if ((m = m_gethdr(M_NOWAIT, MT_DATA)) == NULL)
150 return (ENOMEM);
153 * XXX MH_ALIGN
155 req = mtod(m, struct cpl_l2t_write_req *);
156 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
157 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_L2T_WRITE_REQ, e->idx));
158 req->params = htonl(V_L2T_W_IDX(e->idx) | V_L2T_W_IFF(e->smt_idx) |
159 V_L2T_W_VLAN(e->vlan & EVL_VLID_MASK) |
160 V_L2T_W_PRIO(vlan_prio(e)));
162 memcpy(e->dmac, RT_ENADDR(e->neigh), sizeof(e->dmac));
163 memcpy(req->dst_mac, e->dmac, sizeof(req->dst_mac));
164 m_set_priority(m, CPL_PRIORITY_CONTROL);
165 #ifdef __FreeBSD__
166 cxgb_ofld_send(dev, m);
167 #endif
168 while (e->arpq_head) {
169 m = e->arpq_head;
170 e->arpq_head = m->m_next;
171 m->m_next = NULL;
172 #ifdef __FreeBSD__
173 cxgb_ofld_send(dev, m);
174 #endif
176 e->arpq_tail = NULL;
177 e->state = L2T_STATE_VALID;
179 return 0;
183 * Add a packet to the an L2T entry's queue of packets awaiting resolution.
184 * Must be called with the entry's lock held.
186 static inline void
187 arpq_enqueue(struct l2t_entry *e, struct mbuf *m)
189 m->m_next = NULL;
190 if (e->arpq_head)
191 e->arpq_tail->m_next = m;
192 else
193 e->arpq_head = m;
194 e->arpq_tail = m;
198 t3_l2t_send_slow(struct toedev *dev, struct mbuf *m,
199 struct l2t_entry *e)
201 struct rtentry *rt;
202 struct mbuf *m0;
204 if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL)
205 return (ENOMEM);
207 rt = e->neigh;
209 again:
210 switch (e->state) {
211 case L2T_STATE_STALE: /* entry is stale, kick off revalidation */
212 arpresolve(rt->rt_ifp, rt, m0, rt->rt_gateway, RT_ENADDR(rt));
213 mtx_lock(&e->lock);
214 if (e->state == L2T_STATE_STALE)
215 e->state = L2T_STATE_VALID;
216 mtx_unlock(&e->lock);
217 case L2T_STATE_VALID: /* fast-path, send the packet on */
218 #ifdef __FreeBSD__
219 return cxgb_ofld_send(dev, m);
220 #endif
221 case L2T_STATE_RESOLVING:
222 mtx_lock(&e->lock);
223 if (e->state != L2T_STATE_RESOLVING) { // ARP already completed
224 mtx_unlock(&e->lock);
225 goto again;
227 arpq_enqueue(e, m);
228 mtx_unlock(&e->lock);
230 if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL)
231 return (ENOMEM);
233 * Only the first packet added to the arpq should kick off
234 * resolution. However, because the m_gethdr below can fail,
235 * we allow each packet added to the arpq to retry resolution
236 * as a way of recovering from transient memory exhaustion.
237 * A better way would be to use a work request to retry L2T
238 * entries when there's no memory.
240 if (arpresolve(rt->rt_ifp, rt, m0, rt->rt_gateway, RT_ENADDR(rt)) == 0) {
242 mtx_lock(&e->lock);
243 if (e->arpq_head)
244 setup_l2e_send_pending(dev, m, e);
245 else
246 m_freem(m);
247 mtx_unlock(&e->lock);
250 return 0;
253 void
254 t3_l2t_send_event(struct toedev *dev, struct l2t_entry *e)
256 struct rtentry *rt;
257 struct mbuf *m0;
259 if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL)
260 return;
262 rt = e->neigh;
263 again:
264 switch (e->state) {
265 case L2T_STATE_STALE: /* entry is stale, kick off revalidation */
266 arpresolve(rt->rt_ifp, rt, m0, rt->rt_gateway, RT_ENADDR(rt));
267 mtx_lock(&e->lock);
268 if (e->state == L2T_STATE_STALE) {
269 e->state = L2T_STATE_VALID;
271 mtx_unlock(&e->lock);
272 return;
273 case L2T_STATE_VALID: /* fast-path, send the packet on */
274 return;
275 case L2T_STATE_RESOLVING:
276 mtx_lock(&e->lock);
277 if (e->state != L2T_STATE_RESOLVING) { // ARP already completed
278 mtx_unlock(&e->lock);
279 goto again;
281 mtx_unlock(&e->lock);
283 if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL)
284 return;
286 * Only the first packet added to the arpq should kick off
287 * resolution. However, because the alloc_skb below can fail,
288 * we allow each packet added to the arpq to retry resolution
289 * as a way of recovering from transient memory exhaustion.
290 * A better way would be to use a work request to retry L2T
291 * entries when there's no memory.
293 arpresolve(rt->rt_ifp, rt, m0, rt->rt_gateway, RT_ENADDR(rt));
296 return;
299 * Allocate a free L2T entry. Must be called with l2t_data.lock held.
301 static struct l2t_entry *
302 alloc_l2e(struct l2t_data *d)
304 struct l2t_entry *end, *e, **p;
306 if (!atomic_load_acq_int(&d->nfree))
307 return NULL;
309 /* there's definitely a free entry */
310 for (e = d->rover, end = &d->l2tab[d->nentries]; e != end; ++e)
311 if (atomic_load_acq_int(&e->refcnt) == 0)
312 goto found;
314 for (e = &d->l2tab[1]; atomic_load_acq_int(&e->refcnt); ++e) ;
315 found:
316 d->rover = e + 1;
317 atomic_add_int(&d->nfree, -1);
320 * The entry we found may be an inactive entry that is
321 * presently in the hash table. We need to remove it.
323 if (e->state != L2T_STATE_UNUSED) {
324 int hash = arp_hash(e->addr, e->ifindex, d);
326 for (p = &d->l2tab[hash].first; *p; p = &(*p)->next)
327 if (*p == e) {
328 *p = e->next;
329 break;
331 e->state = L2T_STATE_UNUSED;
333 return e;
337 * Called when an L2T entry has no more users. The entry is left in the hash
338 * table since it is likely to be reused but we also bump nfree to indicate
339 * that the entry can be reallocated for a different neighbor. We also drop
340 * the existing neighbor reference in case the neighbor is going away and is
341 * waiting on our reference.
343 * Because entries can be reallocated to other neighbors once their ref count
344 * drops to 0 we need to take the entry's lock to avoid races with a new
345 * incarnation.
347 void
348 t3_l2e_free(struct l2t_data *d, struct l2t_entry *e)
350 mtx_lock(&e->lock);
351 if (atomic_load_acq_int(&e->refcnt) == 0) { /* hasn't been recycled */
352 if (e->neigh) {
353 RT_LOCK(e->neigh);
354 RT_REMREF(e->neigh);
355 RT_UNLOCK(e->neigh);
356 e->neigh = NULL;
359 mtx_unlock(&e->lock);
360 atomic_add_int(&d->nfree, 1);
364 * Update an L2T entry that was previously used for the same next hop as neigh.
365 * Must be called with softirqs disabled.
367 static inline void
368 reuse_entry(struct l2t_entry *e, struct rtentry *neigh)
370 struct llinfo_arp *la;
372 la = (struct llinfo_arp *)neigh->rt_llinfo;
374 mtx_lock(&e->lock); /* avoid race with t3_l2t_free */
375 if (neigh != e->neigh)
376 neigh_replace(e, neigh);
378 if (memcmp(e->dmac, RT_ENADDR(neigh), sizeof(e->dmac)) ||
379 (neigh->rt_expire > time_uptime))
380 e->state = L2T_STATE_RESOLVING;
381 else if (la->la_hold == NULL)
382 e->state = L2T_STATE_VALID;
383 else
384 e->state = L2T_STATE_STALE;
385 mtx_unlock(&e->lock);
388 struct l2t_entry *
389 t3_l2t_get(struct toedev *dev, struct rtentry *neigh,
390 unsigned int smt_idx)
392 struct l2t_entry *e;
393 struct l2t_data *d = L2DATA(dev);
394 u32 addr = *(u32 *)neigh->_rt_key;
395 int ifidx = neigh->rt_ifp->if_index;
396 int hash = arp_hash(addr, ifidx, d);
398 rw_wlock(&d->lock);
399 for (e = d->l2tab[hash].first; e; e = e->next)
400 if (e->addr == addr && e->ifindex == ifidx &&
401 e->smt_idx == smt_idx) {
402 l2t_hold(d, e);
403 if (atomic_load_acq_int(&e->refcnt) == 1)
404 reuse_entry(e, neigh);
405 goto done;
408 /* Need to allocate a new entry */
409 e = alloc_l2e(d);
410 if (e) {
411 mtx_lock(&e->lock); /* avoid race with t3_l2t_free */
412 e->next = d->l2tab[hash].first;
413 d->l2tab[hash].first = e;
414 e->state = L2T_STATE_RESOLVING;
415 e->addr = addr;
416 e->ifindex = ifidx;
417 e->smt_idx = smt_idx;
418 atomic_store_rel_int(&e->refcnt, 1);
419 neigh_replace(e, neigh);
420 #ifdef notyet
422 * XXX need to add accessor function for vlan tag
424 if (neigh->rt_ifp->if_vlantrunk)
425 e->vlan = VLAN_DEV_INFO(neigh->dev)->vlan_id;
426 else
427 #endif
428 e->vlan = VLAN_NONE;
429 mtx_unlock(&e->lock);
431 done:
432 rw_wunlock(&d->lock);
433 return e;
437 * Called when address resolution fails for an L2T entry to handle packets
438 * on the arpq head. If a packet specifies a failure handler it is invoked,
439 * otherwise the packets is sent to the TOE.
441 * XXX: maybe we should abandon the latter behavior and just require a failure
442 * handler.
444 static void
445 handle_failed_resolution(struct toedev *dev, struct mbuf *arpq)
448 while (arpq) {
449 struct mbuf *m = arpq;
450 #ifdef notyet
451 struct l2t_mbuf_cb *cb = L2T_MBUF_CB(m);
452 #endif
453 arpq = m->m_next;
454 m->m_next = NULL;
455 #ifdef notyet
456 if (cb->arp_failure_handler)
457 cb->arp_failure_handler(dev, m);
458 else
459 #endif
460 #ifdef __FreeBSD__
461 cxgb_ofld_send(dev, m);
462 #endif
467 #if defined(NETEVENT) || !defined(CONFIG_CHELSIO_T3_MODULE)
469 * Called when the host's ARP layer makes a change to some entry that is
470 * loaded into the HW L2 table.
472 void
473 t3_l2t_update(struct toedev *dev, struct rtentry *neigh)
475 struct l2t_entry *e;
476 struct mbuf *arpq = NULL;
477 struct l2t_data *d = L2DATA(dev);
478 u32 addr = *(u32 *)neigh->_rt_key;
479 int ifidx = neigh->rt_ifp->if_index;
480 int hash = arp_hash(addr, ifidx, d);
481 struct llinfo_arp *la;
483 rw_rlock(&d->lock);
484 for (e = d->l2tab[hash].first; e; e = e->next)
485 if (e->addr == addr && e->ifindex == ifidx) {
486 mtx_lock(&e->lock);
487 goto found;
489 rw_runlock(&d->lock);
490 return;
492 found:
493 rw_runlock(&d->lock);
494 if (atomic_load_acq_int(&e->refcnt)) {
495 if (neigh != e->neigh)
496 neigh_replace(e, neigh);
498 la = (struct llinfo_arp *)neigh->rt_llinfo;
499 if (e->state == L2T_STATE_RESOLVING) {
501 if (la->la_asked >= 5 /* arp_maxtries */) {
502 arpq = e->arpq_head;
503 e->arpq_head = e->arpq_tail = NULL;
504 } else if (la->la_hold == NULL)
505 setup_l2e_send_pending(dev, NULL, e);
506 } else {
507 e->state = (la->la_hold == NULL) ?
508 L2T_STATE_VALID : L2T_STATE_STALE;
509 if (memcmp(e->dmac, RT_ENADDR(neigh), 6))
510 setup_l2e_send_pending(dev, NULL, e);
513 mtx_unlock(&e->lock);
515 if (arpq)
516 handle_failed_resolution(dev, arpq);
518 #else
520 * Called from a kprobe, interrupts are off.
522 void
523 t3_l2t_update(struct toedev *dev, struct rtentry *neigh)
525 struct l2t_entry *e;
526 struct l2t_data *d = L2DATA(dev);
527 u32 addr = *(u32 *) rt_key(neigh);
528 int ifidx = neigh->dev->ifindex;
529 int hash = arp_hash(addr, ifidx, d);
531 rw_rlock(&d->lock);
532 for (e = d->l2tab[hash].first; e; e = e->next)
533 if (e->addr == addr && e->ifindex == ifidx) {
534 mtx_lock(&e->lock);
535 if (atomic_load_acq_int(&e->refcnt)) {
536 if (neigh != e->neigh)
537 neigh_replace(e, neigh);
538 e->tdev = dev;
539 mod_timer(&e->update_timer, jiffies + 1);
541 mtx_unlock(&e->lock);
542 break;
544 rw_runlock(&d->lock);
547 static void
548 update_timer_cb(unsigned long data)
550 struct mbuf *arpq = NULL;
551 struct l2t_entry *e = (struct l2t_entry *)data;
552 struct rtentry *neigh = e->neigh;
553 struct toedev *dev = e->tdev;
555 barrier();
556 if (!atomic_load_acq_int(&e->refcnt))
557 return;
559 rw_rlock(&neigh->lock);
560 mtx_lock(&e->lock);
562 if (atomic_load_acq_int(&e->refcnt)) {
563 if (e->state == L2T_STATE_RESOLVING) {
564 if (neigh->nud_state & NUD_FAILED) {
565 arpq = e->arpq_head;
566 e->arpq_head = e->arpq_tail = NULL;
567 } else if (neigh_is_connected(neigh) && e->arpq_head)
568 setup_l2e_send_pending(dev, NULL, e);
569 } else {
570 e->state = neigh_is_connected(neigh) ?
571 L2T_STATE_VALID : L2T_STATE_STALE;
572 if (memcmp(e->dmac, RT_ENADDR(neigh), sizeof(e->dmac)))
573 setup_l2e_send_pending(dev, NULL, e);
576 mtx_unlock(&e->lock);
577 rw_runlock(&neigh->lock);
579 if (arpq)
580 handle_failed_resolution(dev, arpq);
582 #endif
584 struct l2t_data *
585 t3_init_l2t(unsigned int l2t_capacity)
587 struct l2t_data *d;
588 int i, size = sizeof(*d) + l2t_capacity * sizeof(struct l2t_entry);
590 d = cxgb_alloc_mem(size);
591 if (!d)
592 return NULL;
594 d->nentries = l2t_capacity;
595 d->rover = &d->l2tab[1]; /* entry 0 is not used */
596 atomic_store_rel_int(&d->nfree, l2t_capacity - 1);
597 rw_init(&d->lock, "L2T");
599 for (i = 0; i < l2t_capacity; ++i) {
600 d->l2tab[i].idx = i;
601 d->l2tab[i].state = L2T_STATE_UNUSED;
602 mtx_init(&d->l2tab[i].lock, "L2TAB", NULL, MTX_DEF);
603 atomic_store_rel_int(&d->l2tab[i].refcnt, 0);
604 #ifndef NETEVENT
605 #ifdef CONFIG_CHELSIO_T3_MODULE
606 setup_timer(&d->l2tab[i].update_timer, update_timer_cb,
607 (unsigned long)&d->l2tab[i]);
608 #endif
609 #endif
611 return d;
/*
 * Release an L2 table created by t3_init_l2t().  When per-entry update
 * timers are compiled in they are stopped first.
 * NOTE(review): the table lock and per-entry locks initialised in
 * t3_init_l2t() are not destroyed here -- confirm whether that is needed.
 */
void
t3_free_l2t(struct l2t_data *d)
{
#if !defined(NETEVENT) && defined(CONFIG_CHELSIO_T3_MODULE)
	int idx;

	/* Stop all L2T timers */
	for (idx = 0; idx < d->nentries; ++idx)
		del_timer_sync(&d->l2tab[idx].update_timer);
#endif
	cxgb_free_mem(d);
}
629 #ifdef CONFIG_PROC_FS
630 #include <linux/module.h>
631 #include <linux/proc_fs.h>
632 #include <linux/seq_file.h>
634 static inline void *
635 l2t_get_idx(struct seq_file *seq, loff_t pos)
637 struct l2t_data *d = seq->private;
639 return pos >= d->nentries ? NULL : &d->l2tab[pos];
642 static void *
643 l2t_seq_start(struct seq_file *seq, loff_t *pos)
645 return *pos ? l2t_get_idx(seq, *pos) : SEQ_START_TOKEN;
648 static void *
649 l2t_seq_next(struct seq_file *seq, void *v, loff_t *pos)
651 v = l2t_get_idx(seq, *pos + 1);
652 if (v)
653 ++*pos;
654 return v;
/* seq_file stop hook: nothing to release. */
static void
l2t_seq_stop(struct seq_file *seq, void *v)
{
	/* intentionally empty */
}
662 static char
663 l2e_state(const struct l2t_entry *e)
665 switch (e->state) {
666 case L2T_STATE_VALID: return 'V'; /* valid, fast-path entry */
667 case L2T_STATE_STALE: return 'S'; /* needs revalidation, but usable */
668 case L2T_STATE_RESOLVING:
669 return e->arpq_head ? 'A' : 'R';
670 default:
671 return 'U';
675 static int
676 l2t_seq_show(struct seq_file *seq, void *v)
678 if (v == SEQ_START_TOKEN)
679 seq_puts(seq, "Index IP address Ethernet address VLAN "
680 "Prio State Users SMTIDX Port\n");
681 else {
682 char ip[20];
683 struct l2t_entry *e = v;
685 mtx_lock(&e->lock);
686 sprintf(ip, "%u.%u.%u.%u", NIPQUAD(e->addr));
687 seq_printf(seq, "%-5u %-15s %02x:%02x:%02x:%02x:%02x:%02x %4d"
688 " %3u %c %7u %4u %s\n",
689 e->idx, ip, e->dmac[0], e->dmac[1], e->dmac[2],
690 e->dmac[3], e->dmac[4], e->dmac[5],
691 e->vlan & EVL_VLID_MASK, vlan_prio(e),
692 l2e_state(e), atomic_load_acq_int(&e->refcnt), e->smt_idx,
693 e->neigh ? e->neigh->dev->name : "");
694 mtx_unlock(&e->lock);
696 return 0;
699 #endif