2 * net/core/dst.c Protocol independent destination cache.
4 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
8 #include <linux/bitops.h>
9 #include <linux/errno.h>
10 #include <linux/init.h>
11 #include <linux/kernel.h>
12 #include <linux/workqueue.h>
14 #include <linux/module.h>
15 #include <linux/slab.h>
16 #include <linux/netdevice.h>
17 #include <linux/skbuff.h>
18 #include <linux/string.h>
19 #include <linux/types.h>
20 #include <net/net_namespace.h>
21 #include <linux/sched.h>
22 #include <linux/prefetch.h>
23 #include <net/lwtunnel.h>
26 #include <net/dst_metadata.h>
29 * Theory of operations:
30 * 1) We use a list, protected by a spinlock, to add
31 * new entries from both BH and non-BH context.
32 * 2) In order to keep the spinlock held only for a short time,
33 * we use a second list in which long-lived entries are
34 * stored; these are handled by the garbage collection task
35 * fired by a workqueue.
36 * 3) This list is guarded by a mutex,
37 * so that the gc_task and dst_dev_event() can be synchronized.
41 * We want to keep lock & list close together
42 * to dirty as few cache lines as possible in __dst_free().
43 * As this is not a very strong hint, we don't force an alignment on SMP.
47 struct dst_entry
*list
;
48 unsigned long timer_inc
;
49 unsigned long timer_expires
;
51 .lock
= __SPIN_LOCK_UNLOCKED(dst_garbage
.lock
),
52 .timer_inc
= DST_GC_MAX
,
54 static void dst_gc_task(struct work_struct
*work
);
55 static void ___dst_free(struct dst_entry
*dst
);
57 static DECLARE_DELAYED_WORK(dst_gc_work
, dst_gc_task
);
59 static DEFINE_MUTEX(dst_gc_mutex
);
61 * long lived entries are maintained in this list, guarded by dst_gc_mutex
63 static struct dst_entry
*dst_busy_list
;
65 static void dst_gc_task(struct work_struct
*work
)
68 int work_performed
= 0;
69 unsigned long expires
= ~0L;
70 struct dst_entry
*dst
, *next
, head
;
71 struct dst_entry
*last
= &head
;
73 mutex_lock(&dst_gc_mutex
);
77 while ((dst
= next
) != NULL
) {
79 prefetch(&next
->next
);
81 if (likely(atomic_read(&dst
->__refcnt
))) {
89 dst
= dst_destroy(dst
);
91 /* NOHASH and still referenced. Unless it is already
92 * on gc list, invalidate it and add to gc list.
94 * Note: this is temporary. Actually, NOHASH dst's
95 * must be obsoleted when parent is obsoleted.
96 * But we do not have state "obsoleted, but
97 * referenced by parent", so it is right.
99 if (dst
->obsolete
> 0)
108 spin_lock_bh(&dst_garbage
.lock
);
109 next
= dst_garbage
.list
;
111 dst_garbage
.list
= NULL
;
112 spin_unlock_bh(&dst_garbage
.lock
);
116 dst_busy_list
= head
.next
;
118 dst_garbage
.timer_inc
= DST_GC_MAX
;
121 * if we freed less than 1/10 of delayed entries,
122 * we can sleep longer.
124 if (work_performed
<= delayed
/10) {
125 dst_garbage
.timer_expires
+= dst_garbage
.timer_inc
;
126 if (dst_garbage
.timer_expires
> DST_GC_MAX
)
127 dst_garbage
.timer_expires
= DST_GC_MAX
;
128 dst_garbage
.timer_inc
+= DST_GC_INC
;
130 dst_garbage
.timer_inc
= DST_GC_INC
;
131 dst_garbage
.timer_expires
= DST_GC_MIN
;
133 expires
= dst_garbage
.timer_expires
;
135 * if the next desired timer is more than 4 seconds in the
136 * future then round the timer to whole seconds
139 expires
= round_jiffies_relative(expires
);
140 schedule_delayed_work(&dst_gc_work
, expires
);
143 spin_unlock_bh(&dst_garbage
.lock
);
144 mutex_unlock(&dst_gc_mutex
);
147 int dst_discard_out(struct net
*net
, struct sock
*sk
, struct sk_buff
*skb
)
152 EXPORT_SYMBOL(dst_discard_out
);
154 const struct dst_metrics dst_default_metrics
= {
155 /* This initializer is needed to force linker to place this variable
156 * into const section. Otherwise it might end into bss section.
157 * We really want to avoid false sharing on this variable, and catch
160 .refcnt
= ATOMIC_INIT(1),
163 void dst_init(struct dst_entry
*dst
, struct dst_ops
*ops
,
164 struct net_device
*dev
, int initial_ref
, int initial_obsolete
,
165 unsigned short flags
)
172 dst_init_metrics(dst
, dst_default_metrics
.metrics
, true);
179 dst
->input
= dst_discard
;
180 dst
->output
= dst_discard_out
;
182 dst
->obsolete
= initial_obsolete
;
184 dst
->trailer_len
= 0;
185 #ifdef CONFIG_IP_ROUTE_CLASSID
188 dst
->lwtstate
= NULL
;
189 atomic_set(&dst
->__refcnt
, initial_ref
);
191 dst
->lastuse
= jiffies
;
193 dst
->pending_confirm
= 0;
195 if (!(flags
& DST_NOCOUNT
))
196 dst_entries_add(ops
, 1);
198 EXPORT_SYMBOL(dst_init
);
200 void *dst_alloc(struct dst_ops
*ops
, struct net_device
*dev
,
201 int initial_ref
, int initial_obsolete
, unsigned short flags
)
203 struct dst_entry
*dst
;
205 if (ops
->gc
&& dst_entries_get_fast(ops
) > ops
->gc_thresh
) {
210 dst
= kmem_cache_alloc(ops
->kmem_cachep
, GFP_ATOMIC
);
214 dst_init(dst
, ops
, dev
, initial_ref
, initial_obsolete
, flags
);
218 EXPORT_SYMBOL(dst_alloc
);
220 static void ___dst_free(struct dst_entry
*dst
)
222 /* The first case (dev==NULL) is required when
223 the protocol module is unloaded.
225 if (dst
->dev
== NULL
|| !(dst
->dev
->flags
&IFF_UP
)) {
226 dst
->input
= dst_discard
;
227 dst
->output
= dst_discard_out
;
229 dst
->obsolete
= DST_OBSOLETE_DEAD
;
232 void __dst_free(struct dst_entry
*dst
)
234 spin_lock_bh(&dst_garbage
.lock
);
236 dst
->next
= dst_garbage
.list
;
237 dst_garbage
.list
= dst
;
238 if (dst_garbage
.timer_inc
> DST_GC_INC
) {
239 dst_garbage
.timer_inc
= DST_GC_INC
;
240 dst_garbage
.timer_expires
= DST_GC_MIN
;
241 mod_delayed_work(system_wq
, &dst_gc_work
,
242 dst_garbage
.timer_expires
);
244 spin_unlock_bh(&dst_garbage
.lock
);
246 EXPORT_SYMBOL(__dst_free
);
248 struct dst_entry
*dst_destroy(struct dst_entry
* dst
)
250 struct dst_entry
*child
;
257 if (!(dst
->flags
& DST_NOCOUNT
))
258 dst_entries_add(dst
->ops
, -1);
260 if (dst
->ops
->destroy
)
261 dst
->ops
->destroy(dst
);
265 lwtstate_put(dst
->lwtstate
);
267 if (dst
->flags
& DST_METADATA
)
270 kmem_cache_free(dst
->ops
->kmem_cachep
, dst
);
274 int nohash
= dst
->flags
& DST_NOHASH
;
276 if (atomic_dec_and_test(&dst
->__refcnt
)) {
277 /* We were real parent of this dst, so kill child. */
281 /* Child is still referenced, return it for freeing. */
284 /* Child is still in his hash table */
289 EXPORT_SYMBOL(dst_destroy
);
291 static void dst_destroy_rcu(struct rcu_head
*head
)
293 struct dst_entry
*dst
= container_of(head
, struct dst_entry
, rcu_head
);
295 dst
= dst_destroy(dst
);
300 void dst_release(struct dst_entry
*dst
)
304 unsigned short nocache
= dst
->flags
& DST_NOCACHE
;
306 newrefcnt
= atomic_dec_return(&dst
->__refcnt
);
307 if (unlikely(newrefcnt
< 0))
308 net_warn_ratelimited("%s: dst:%p refcnt:%d\n",
309 __func__
, dst
, newrefcnt
);
310 if (!newrefcnt
&& unlikely(nocache
))
311 call_rcu(&dst
->rcu_head
, dst_destroy_rcu
);
314 EXPORT_SYMBOL(dst_release
);
316 u32
*dst_cow_metrics_generic(struct dst_entry
*dst
, unsigned long old
)
318 struct dst_metrics
*p
= kmalloc(sizeof(*p
), GFP_ATOMIC
);
321 struct dst_metrics
*old_p
= (struct dst_metrics
*)__DST_METRICS_PTR(old
);
322 unsigned long prev
, new;
324 atomic_set(&p
->refcnt
, 1);
325 memcpy(p
->metrics
, old_p
->metrics
, sizeof(p
->metrics
));
327 new = (unsigned long) p
;
328 prev
= cmpxchg(&dst
->_metrics
, old
, new);
332 p
= (struct dst_metrics
*)__DST_METRICS_PTR(prev
);
333 if (prev
& DST_METRICS_READ_ONLY
)
335 } else if (prev
& DST_METRICS_REFCOUNTED
) {
336 if (atomic_dec_and_test(&old_p
->refcnt
))
340 BUILD_BUG_ON(offsetof(struct dst_metrics
, metrics
) != 0);
343 EXPORT_SYMBOL(dst_cow_metrics_generic
);
345 /* Caller asserts that dst_metrics_read_only(dst) is false. */
346 void __dst_destroy_metrics_generic(struct dst_entry
*dst
, unsigned long old
)
348 unsigned long prev
, new;
350 new = ((unsigned long) &dst_default_metrics
) | DST_METRICS_READ_ONLY
;
351 prev
= cmpxchg(&dst
->_metrics
, old
, new);
353 kfree(__DST_METRICS_PTR(old
));
355 EXPORT_SYMBOL(__dst_destroy_metrics_generic
);
357 static struct dst_ops md_dst_ops
= {
361 static int dst_md_discard_out(struct net
*net
, struct sock
*sk
, struct sk_buff
*skb
)
363 WARN_ONCE(1, "Attempting to call output on metadata dst\n");
368 static int dst_md_discard(struct sk_buff
*skb
)
370 WARN_ONCE(1, "Attempting to call input on metadata dst\n");
375 static void __metadata_dst_init(struct metadata_dst
*md_dst
, u8 optslen
)
377 struct dst_entry
*dst
;
380 dst_init(dst
, &md_dst_ops
, NULL
, 1, DST_OBSOLETE_NONE
,
381 DST_METADATA
| DST_NOCACHE
| DST_NOCOUNT
);
383 dst
->input
= dst_md_discard
;
384 dst
->output
= dst_md_discard_out
;
386 memset(dst
+ 1, 0, sizeof(*md_dst
) + optslen
- sizeof(*dst
));
389 struct metadata_dst
*metadata_dst_alloc(u8 optslen
, gfp_t flags
)
391 struct metadata_dst
*md_dst
;
393 md_dst
= kmalloc(sizeof(*md_dst
) + optslen
, flags
);
397 __metadata_dst_init(md_dst
, optslen
);
401 EXPORT_SYMBOL_GPL(metadata_dst_alloc
);
403 struct metadata_dst __percpu
*metadata_dst_alloc_percpu(u8 optslen
, gfp_t flags
)
406 struct metadata_dst __percpu
*md_dst
;
408 md_dst
= __alloc_percpu_gfp(sizeof(struct metadata_dst
) + optslen
,
409 __alignof__(struct metadata_dst
), flags
);
413 for_each_possible_cpu(cpu
)
414 __metadata_dst_init(per_cpu_ptr(md_dst
, cpu
), optslen
);
418 EXPORT_SYMBOL_GPL(metadata_dst_alloc_percpu
);
420 /* Dirty hack. We did it in 2.2 (in __dst_free),
421 * we have _very_ good reasons not to repeat
422 * this mistake in 2.3, but we have no choice
423 * now. _It_ _is_ _explicit_ _deliberate_
424 * _race_ _condition_.
426 * Commented and originally written by Alexey.
428 static void dst_ifdown(struct dst_entry
*dst
, struct net_device
*dev
,
431 if (dst
->ops
->ifdown
)
432 dst
->ops
->ifdown(dst
, dev
, unregister
);
438 dst
->input
= dst_discard
;
439 dst
->output
= dst_discard_out
;
441 dst
->dev
= dev_net(dst
->dev
)->loopback_dev
;
447 static int dst_dev_event(struct notifier_block
*this, unsigned long event
,
450 struct net_device
*dev
= netdev_notifier_info_to_dev(ptr
);
451 struct dst_entry
*dst
, *last
= NULL
;
454 case NETDEV_UNREGISTER_FINAL
:
456 mutex_lock(&dst_gc_mutex
);
457 for (dst
= dst_busy_list
; dst
; dst
= dst
->next
) {
459 dst_ifdown(dst
, dev
, event
!= NETDEV_DOWN
);
462 spin_lock_bh(&dst_garbage
.lock
);
463 dst
= dst_garbage
.list
;
464 dst_garbage
.list
= NULL
;
465 /* The code in dst_ifdown places a hold on the loopback device.
466 * If the gc entry processing is set to expire after a lengthy
467 * interval, this hold can cause netdev_wait_allrefs() to hang
468 * out and wait for a long time -- until the loopback
469 * interface is released. If we're really unlucky, it'll emit
470 * pr_emerg messages to console too. Reset the interval here,
471 * so dst cleanups occur in a more timely fashion.
473 if (dst_garbage
.timer_inc
> DST_GC_INC
) {
474 dst_garbage
.timer_inc
= DST_GC_INC
;
475 dst_garbage
.timer_expires
= DST_GC_MIN
;
476 mod_delayed_work(system_wq
, &dst_gc_work
,
477 dst_garbage
.timer_expires
);
479 spin_unlock_bh(&dst_garbage
.lock
);
485 for (; dst
; dst
= dst
->next
)
486 dst_ifdown(dst
, dev
, event
!= NETDEV_DOWN
);
487 mutex_unlock(&dst_gc_mutex
);
493 static struct notifier_block dst_dev_notifier
= {
494 .notifier_call
= dst_dev_event
,
495 .priority
= -10, /* must be called after other network notifiers */
498 void __init
dst_subsys_init(void)
500 register_netdevice_notifier(&dst_dev_notifier
);