2 * net/core/dst.c Protocol independent destination cache.
4 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
8 #include <linux/bitops.h>
9 #include <linux/errno.h>
10 #include <linux/init.h>
11 #include <linux/kernel.h>
12 #include <linux/workqueue.h>
14 #include <linux/module.h>
15 #include <linux/slab.h>
16 #include <linux/netdevice.h>
17 #include <linux/skbuff.h>
18 #include <linux/string.h>
19 #include <linux/types.h>
20 #include <net/net_namespace.h>
21 #include <linux/sched.h>
22 #include <linux/prefetch.h>
23 #include <net/lwtunnel.h>
26 #include <net/dst_metadata.h>
29 * Theory of operations:
30 * 1) We use a list, protected by a spinlock, to add
31 * new entries from both BH and non-BH context.
32 * 2) In order to keep the spinlock held for only a short time,
33 * we use a second list that stores long-lived
34 * entries, which are handled by the garbage-collection thread
35 * fired by a workqueue.
36 * 3) This list is guarded by a mutex,
37 * so that the gc_task and dst_dev_event() can be synchronized.
41 * We want to keep lock & list close together
42 * to dirty as few cache lines as possible in __dst_free().
43 * As this is not a very strong hint, we don't force an alignment on SMP.
47 struct dst_entry
*list
;
48 unsigned long timer_inc
;
49 unsigned long timer_expires
;
51 .lock
= __SPIN_LOCK_UNLOCKED(dst_garbage
.lock
),
52 .timer_inc
= DST_GC_MAX
,
54 static void dst_gc_task(struct work_struct
*work
);
55 static void ___dst_free(struct dst_entry
*dst
);
57 static DECLARE_DELAYED_WORK(dst_gc_work
, dst_gc_task
);
59 static DEFINE_MUTEX(dst_gc_mutex
);
61 * Long-lived entries are maintained in this list, guarded by dst_gc_mutex.
63 static struct dst_entry
*dst_busy_list
;
65 static void dst_gc_task(struct work_struct
*work
)
68 int work_performed
= 0;
69 unsigned long expires
= ~0L;
70 struct dst_entry
*dst
, *next
, head
;
71 struct dst_entry
*last
= &head
;
73 mutex_lock(&dst_gc_mutex
);
77 while ((dst
= next
) != NULL
) {
79 prefetch(&next
->next
);
81 if (likely(atomic_read(&dst
->__refcnt
))) {
89 dst
= dst_destroy(dst
);
91 /* NOHASH and still referenced. Unless it is already
92 * on gc list, invalidate it and add to gc list.
94 * Note: this is temporary. Actually, NOHASH dst's
95 * must be obsoleted when parent is obsoleted.
96 * But we do not have state "obsoleted, but
97 * referenced by parent", so it is right.
99 if (dst
->obsolete
> 0)
108 spin_lock_bh(&dst_garbage
.lock
);
109 next
= dst_garbage
.list
;
111 dst_garbage
.list
= NULL
;
112 spin_unlock_bh(&dst_garbage
.lock
);
116 dst_busy_list
= head
.next
;
118 dst_garbage
.timer_inc
= DST_GC_MAX
;
121 * if we freed less than 1/10 of delayed entries,
122 * we can sleep longer.
124 if (work_performed
<= delayed
/10) {
125 dst_garbage
.timer_expires
+= dst_garbage
.timer_inc
;
126 if (dst_garbage
.timer_expires
> DST_GC_MAX
)
127 dst_garbage
.timer_expires
= DST_GC_MAX
;
128 dst_garbage
.timer_inc
+= DST_GC_INC
;
130 dst_garbage
.timer_inc
= DST_GC_INC
;
131 dst_garbage
.timer_expires
= DST_GC_MIN
;
133 expires
= dst_garbage
.timer_expires
;
135 * if the next desired timer is more than 4 seconds in the
136 * future then round the timer to whole seconds
139 expires
= round_jiffies_relative(expires
);
140 schedule_delayed_work(&dst_gc_work
, expires
);
143 spin_unlock_bh(&dst_garbage
.lock
);
144 mutex_unlock(&dst_gc_mutex
);
147 int dst_discard_out(struct net
*net
, struct sock
*sk
, struct sk_buff
*skb
)
152 EXPORT_SYMBOL(dst_discard_out
);
154 const u32 dst_default_metrics
[RTAX_MAX
+ 1] = {
155 /* This initializer is needed to force linker to place this variable
156 * into const section. Otherwise it might end into bss section.
157 * We really want to avoid false sharing on this variable, and catch
160 [RTAX_MAX
] = 0xdeadbeef,
163 void dst_init(struct dst_entry
*dst
, struct dst_ops
*ops
,
164 struct net_device
*dev
, int initial_ref
, int initial_obsolete
,
165 unsigned short flags
)
172 dst_init_metrics(dst
, dst_default_metrics
, true);
179 dst
->input
= dst_discard
;
180 dst
->output
= dst_discard_out
;
182 dst
->obsolete
= initial_obsolete
;
184 dst
->trailer_len
= 0;
185 #ifdef CONFIG_IP_ROUTE_CLASSID
188 dst
->lwtstate
= NULL
;
189 atomic_set(&dst
->__refcnt
, initial_ref
);
191 dst
->lastuse
= jiffies
;
193 dst
->pending_confirm
= 0;
195 if (!(flags
& DST_NOCOUNT
))
196 dst_entries_add(ops
, 1);
198 EXPORT_SYMBOL(dst_init
);
200 void *dst_alloc(struct dst_ops
*ops
, struct net_device
*dev
,
201 int initial_ref
, int initial_obsolete
, unsigned short flags
)
203 struct dst_entry
*dst
;
205 if (ops
->gc
&& dst_entries_get_fast(ops
) > ops
->gc_thresh
) {
210 dst
= kmem_cache_alloc(ops
->kmem_cachep
, GFP_ATOMIC
);
214 dst_init(dst
, ops
, dev
, initial_ref
, initial_obsolete
, flags
);
218 EXPORT_SYMBOL(dst_alloc
);
220 static void ___dst_free(struct dst_entry
*dst
)
222 /* The first case (dev==NULL) is required, when
223 protocol module is unloaded.
225 if (dst
->dev
== NULL
|| !(dst
->dev
->flags
&IFF_UP
)) {
226 dst
->input
= dst_discard
;
227 dst
->output
= dst_discard_out
;
229 dst
->obsolete
= DST_OBSOLETE_DEAD
;
232 void __dst_free(struct dst_entry
*dst
)
234 spin_lock_bh(&dst_garbage
.lock
);
236 dst
->next
= dst_garbage
.list
;
237 dst_garbage
.list
= dst
;
238 if (dst_garbage
.timer_inc
> DST_GC_INC
) {
239 dst_garbage
.timer_inc
= DST_GC_INC
;
240 dst_garbage
.timer_expires
= DST_GC_MIN
;
241 mod_delayed_work(system_wq
, &dst_gc_work
,
242 dst_garbage
.timer_expires
);
244 spin_unlock_bh(&dst_garbage
.lock
);
246 EXPORT_SYMBOL(__dst_free
);
248 struct dst_entry
*dst_destroy(struct dst_entry
* dst
)
250 struct dst_entry
*child
;
257 if (!(dst
->flags
& DST_NOCOUNT
))
258 dst_entries_add(dst
->ops
, -1);
260 if (dst
->ops
->destroy
)
261 dst
->ops
->destroy(dst
);
265 lwtstate_put(dst
->lwtstate
);
267 if (dst
->flags
& DST_METADATA
)
270 kmem_cache_free(dst
->ops
->kmem_cachep
, dst
);
274 int nohash
= dst
->flags
& DST_NOHASH
;
276 if (atomic_dec_and_test(&dst
->__refcnt
)) {
277 /* We were real parent of this dst, so kill child. */
281 /* Child is still referenced, return it for freeing. */
284 /* Child is still in his hash table */
289 EXPORT_SYMBOL(dst_destroy
);
291 static void dst_destroy_rcu(struct rcu_head
*head
)
293 struct dst_entry
*dst
= container_of(head
, struct dst_entry
, rcu_head
);
295 dst
= dst_destroy(dst
);
300 void dst_release(struct dst_entry
*dst
)
305 newrefcnt
= atomic_dec_return(&dst
->__refcnt
);
306 if (unlikely(newrefcnt
< 0))
307 net_warn_ratelimited("%s: dst:%p refcnt:%d\n",
308 __func__
, dst
, newrefcnt
);
309 if (!newrefcnt
&& unlikely(dst
->flags
& DST_NOCACHE
))
310 call_rcu(&dst
->rcu_head
, dst_destroy_rcu
);
313 EXPORT_SYMBOL(dst_release
);
315 u32
*dst_cow_metrics_generic(struct dst_entry
*dst
, unsigned long old
)
317 u32
*p
= kmalloc(sizeof(u32
) * RTAX_MAX
, GFP_ATOMIC
);
320 u32
*old_p
= __DST_METRICS_PTR(old
);
321 unsigned long prev
, new;
323 memcpy(p
, old_p
, sizeof(u32
) * RTAX_MAX
);
325 new = (unsigned long) p
;
326 prev
= cmpxchg(&dst
->_metrics
, old
, new);
330 p
= __DST_METRICS_PTR(prev
);
331 if (prev
& DST_METRICS_READ_ONLY
)
337 EXPORT_SYMBOL(dst_cow_metrics_generic
);
339 /* Caller asserts that dst_metrics_read_only(dst) is false. */
340 void __dst_destroy_metrics_generic(struct dst_entry
*dst
, unsigned long old
)
342 unsigned long prev
, new;
344 new = ((unsigned long) dst_default_metrics
) | DST_METRICS_READ_ONLY
;
345 prev
= cmpxchg(&dst
->_metrics
, old
, new);
347 kfree(__DST_METRICS_PTR(old
));
349 EXPORT_SYMBOL(__dst_destroy_metrics_generic
);
351 static struct dst_ops md_dst_ops
= {
355 static int dst_md_discard_out(struct net
*net
, struct sock
*sk
, struct sk_buff
*skb
)
357 WARN_ONCE(1, "Attempting to call output on metadata dst\n");
362 static int dst_md_discard(struct sk_buff
*skb
)
364 WARN_ONCE(1, "Attempting to call input on metadata dst\n");
369 static void __metadata_dst_init(struct metadata_dst
*md_dst
, u8 optslen
)
371 struct dst_entry
*dst
;
374 dst_init(dst
, &md_dst_ops
, NULL
, 1, DST_OBSOLETE_NONE
,
375 DST_METADATA
| DST_NOCACHE
| DST_NOCOUNT
);
377 dst
->input
= dst_md_discard
;
378 dst
->output
= dst_md_discard_out
;
380 memset(dst
+ 1, 0, sizeof(*md_dst
) + optslen
- sizeof(*dst
));
383 struct metadata_dst
*metadata_dst_alloc(u8 optslen
, gfp_t flags
)
385 struct metadata_dst
*md_dst
;
387 md_dst
= kmalloc(sizeof(*md_dst
) + optslen
, flags
);
391 __metadata_dst_init(md_dst
, optslen
);
395 EXPORT_SYMBOL_GPL(metadata_dst_alloc
);
397 struct metadata_dst __percpu
*metadata_dst_alloc_percpu(u8 optslen
, gfp_t flags
)
400 struct metadata_dst __percpu
*md_dst
;
402 md_dst
= __alloc_percpu_gfp(sizeof(struct metadata_dst
) + optslen
,
403 __alignof__(struct metadata_dst
), flags
);
407 for_each_possible_cpu(cpu
)
408 __metadata_dst_init(per_cpu_ptr(md_dst
, cpu
), optslen
);
412 EXPORT_SYMBOL_GPL(metadata_dst_alloc_percpu
);
414 /* Dirty hack. We did it in 2.2 (in __dst_free),
415 * we have _very_ good reasons not to repeat
416 * this mistake in 2.3, but we have no choice
417 * now. _It_ _is_ _explicit_ _deliberate_
418 * _race_ _condition_.
420 * Commented and originally written by Alexey.
422 static void dst_ifdown(struct dst_entry
*dst
, struct net_device
*dev
,
425 if (dst
->ops
->ifdown
)
426 dst
->ops
->ifdown(dst
, dev
, unregister
);
432 dst
->input
= dst_discard
;
433 dst
->output
= dst_discard_out
;
435 dst
->dev
= dev_net(dst
->dev
)->loopback_dev
;
441 static int dst_dev_event(struct notifier_block
*this, unsigned long event
,
444 struct net_device
*dev
= netdev_notifier_info_to_dev(ptr
);
445 struct dst_entry
*dst
, *last
= NULL
;
448 case NETDEV_UNREGISTER_FINAL
:
450 mutex_lock(&dst_gc_mutex
);
451 for (dst
= dst_busy_list
; dst
; dst
= dst
->next
) {
453 dst_ifdown(dst
, dev
, event
!= NETDEV_DOWN
);
456 spin_lock_bh(&dst_garbage
.lock
);
457 dst
= dst_garbage
.list
;
458 dst_garbage
.list
= NULL
;
459 spin_unlock_bh(&dst_garbage
.lock
);
465 for (; dst
; dst
= dst
->next
)
466 dst_ifdown(dst
, dev
, event
!= NETDEV_DOWN
);
467 mutex_unlock(&dst_gc_mutex
);
473 static struct notifier_block dst_dev_notifier
= {
474 .notifier_call
= dst_dev_event
,
475 .priority
= -10, /* must be called after other network notifiers */
478 void __init
dst_subsys_init(void)
480 register_netdevice_notifier(&dst_dev_notifier
);