// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * inet fragments management
 *
 * Authors:	Pavel Emelyanov <xemul@openvz.org>
 *		Started as consolidation of ipv4/ip_fragment.c,
 *		ipv6/reassembly.c and ipv6 nf conntrack reassembly
 */

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/random.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/slab.h>
#include <linux/rhashtable.h>

#include <net/sock.h>
#include <net/inet_frag.h>
#include <net/inet_ecn.h>
#include <net/ip.h>
#include <net/ipv6.h>

/* Use skb->cb to track consecutive/adjacent fragments coming at
 * the end of the queue. Nodes in the rb-tree queue will
 * contain "runs" of one or more adjacent fragments.
 *
 * Invariants:
 * - next_frag is NULL at the tail of a "run";
 * - the head of a "run" has the sum of all fragment lengths in frag_run_len.
 */
struct ipfrag_skb_cb {
	union {
		struct inet_skb_parm	h4;
		struct inet6_skb_parm	h6;
	};
	struct sk_buff		*next_frag;
	int			frag_run_len;
};

#define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb))

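/* Illustrative example (not from the original source): fragments covering
 * offsets [0,100), [100,200) and [400,500) are stored as two runs:
 *
 *	run 1: head = skb(0,100), frag_run_len = 200, next_frag -> skb(100,200)
 *	run 2: head = skb(400,500), frag_run_len = 100, next_frag = NULL
 *
 * Only run heads are rb-tree nodes; trailing members hang off next_frag.
 */
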
static void fragcb_clear(struct sk_buff *skb)
{
	RB_CLEAR_NODE(&skb->rbnode);
	FRAG_CB(skb)->next_frag = NULL;
	FRAG_CB(skb)->frag_run_len = skb->len;
}

/* Append skb to the last "run". */
static void fragrun_append_to_last(struct inet_frag_queue *q,
				   struct sk_buff *skb)
{
	fragcb_clear(skb);

	FRAG_CB(q->last_run_head)->frag_run_len += skb->len;
	FRAG_CB(q->fragments_tail)->next_frag = skb;
	q->fragments_tail = skb;
}

/* Create a new "run" with the skb. */
static void fragrun_create(struct inet_frag_queue *q, struct sk_buff *skb)
{
	BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb));
	fragcb_clear(skb);

	if (q->last_run_head)
		rb_link_node(&skb->rbnode, &q->last_run_head->rbnode,
			     &q->last_run_head->rbnode.rb_right);
	else
		rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node);
	rb_insert_color(&skb->rbnode, &q->rb_fragments);

	q->fragments_tail = skb;
	q->last_run_head = skb;
}

/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
 * Value : 0xff if frame should be dropped.
 *         0 or INET_ECN_CE value, to be ORed in to final iph->tos field
 */
const u8 ip_frag_ecn_table[16] = {
	/* at least one fragment had CE, and others ECT_0 or ECT_1 */
	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0]			= INET_ECN_CE,
	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1]			= INET_ECN_CE,
	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1]	= INET_ECN_CE,

	/* invalid combinations : drop frame */
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
};
EXPORT_SYMBOL(ip_frag_ecn_table);

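/* Example use by a reassembler (a sketch, not code from this file): callers
 * OR the per-fragment IPFRAG_ECN_* value into q->ecn while queueing, then
 * consult the table once the datagram is complete:
 *
 *	u8 ecn = ip_frag_ecn_table[q->ecn];
 *
 *	if (unlikely(ecn == 0xff))
 *		goto err;	// inconsistent ECN marking, drop the datagram
 *	iph->tos |= ecn;	// otherwise fold CE back into the rebuilt header
 */
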
int inet_frags_init(struct inet_frags *f)
{
	f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0,
					    NULL);
	if (!f->frags_cachep)
		return -ENOMEM;

	refcount_set(&f->refcnt, 1);
	init_completion(&f->completion);
	return 0;
}
EXPORT_SYMBOL(inet_frags_init);

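/* Drop the initial reference taken in inet_frags_init(), wait until the last
 * user (e.g. pending fqdir free work) signals the completion, then destroy
 * the queue cache.
 */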
void inet_frags_fini(struct inet_frags *f)
{
	if (refcount_dec_and_test(&f->refcnt))
		complete(&f->completion);

	wait_for_completion(&f->completion);

	kmem_cache_destroy(f->frags_cachep);
	f->frags_cachep = NULL;
}
EXPORT_SYMBOL(inet_frags_fini);

/* called from rhashtable_free_and_destroy() at netns_frags dismantle */
static void inet_frags_free_cb(void *ptr, void *arg)
{
	struct inet_frag_queue *fq = ptr;
	int count;

	count = del_timer_sync(&fq->timer) ? 1 : 0;

	spin_lock_bh(&fq->lock);
	if (!(fq->flags & INET_FRAG_COMPLETE)) {
		fq->flags |= INET_FRAG_COMPLETE;
		count++;
	} else if (fq->flags & INET_FRAG_HASH_DEAD) {
		count++;
	}
	spin_unlock_bh(&fq->lock);

	if (refcount_sub_and_test(count, &fq->refcnt))
		inet_frag_destroy(fq);
}

static LLIST_HEAD(fqdir_free_list);

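/* fqdirs are batched on fqdir_free_list and freed from a work item, after an
 * rcu_barrier() guarantees that no inet_frag_destroy_rcu() callback can
 * still dereference them.
 */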
static void fqdir_free_fn(struct work_struct *work)
{
	struct llist_node *kill_list;
	struct fqdir *fqdir, *tmp;
	struct inet_frags *f;

	/* Atomically snapshot the list of fqdirs to free */
	kill_list = llist_del_all(&fqdir_free_list);

	/* We need to make sure all ongoing call_rcu(..., inet_frag_destroy_rcu)
	 * have completed, since they need to dereference fqdir.
	 * Would it not be nice to have kfree_rcu_barrier() ? :)
	 */
	rcu_barrier();

	llist_for_each_entry_safe(fqdir, tmp, kill_list, free_list) {
		f = fqdir->f;
		if (refcount_dec_and_test(&f->refcnt))
			complete(&f->completion);
		kfree(fqdir);
	}
}

static DECLARE_WORK(fqdir_free_work, fqdir_free_fn);

static void fqdir_work_fn(struct work_struct *work)
{
	struct fqdir *fqdir = container_of(work, struct fqdir, destroy_work);

	rhashtable_free_and_destroy(&fqdir->rhashtable, inet_frags_free_cb, NULL);

	if (llist_add(&fqdir->free_list, &fqdir_free_list))
		queue_work(system_wq, &fqdir_free_work);
}

int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net)
{
	struct fqdir *fqdir = kzalloc(sizeof(*fqdir), GFP_KERNEL);
	int res;

	if (!fqdir)
		return -ENOMEM;
	fqdir->f = f;
	fqdir->net = net;
	res = rhashtable_init(&fqdir->rhashtable, &fqdir->f->rhash_params);
	if (res < 0) {
		kfree(fqdir);
		return res;
	}
	refcount_inc(&f->refcnt);
	*fqdirp = fqdir;
	return 0;
}
EXPORT_SYMBOL(fqdir_init);

static struct workqueue_struct *inet_frag_wq;

static int __init inet_frag_wq_init(void)
{
	inet_frag_wq = create_workqueue("inet_frag_wq");
	if (!inet_frag_wq)
		panic("Could not create inet frag workq");
	return 0;
}

pure_initcall(inet_frag_wq_init);

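/* Tear down a per-namespace fqdir: the rhashtable (and every queue still in
 * it) is dismantled from workqueue context, and the fqdir itself is freed
 * later by fqdir_free_fn().
 */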
void fqdir_exit(struct fqdir *fqdir)
{
	INIT_WORK(&fqdir->destroy_work, fqdir_work_fn);
	queue_work(inet_frag_wq, &fqdir->destroy_work);
}
EXPORT_SYMBOL(fqdir_exit);

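/* Stop the queue's timer and unlink it from the hash table, dropping the
 * references held by each, unless the hash table is already being destroyed.
 */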
void inet_frag_kill(struct inet_frag_queue *fq)
{
	if (del_timer(&fq->timer))
		refcount_dec(&fq->refcnt);

	if (!(fq->flags & INET_FRAG_COMPLETE)) {
		struct fqdir *fqdir = fq->fqdir;

		fq->flags |= INET_FRAG_COMPLETE;
		rcu_read_lock();
		/* The RCU read lock provides a memory barrier
		 * guaranteeing that if fqdir->dead is false then
		 * the hash table destruction will not start until
		 * after we unlock. Paired with inet_frags_exit_net().
		 */
		if (!fqdir->dead) {
			rhashtable_remove_fast(&fqdir->rhashtable, &fq->node,
					       fqdir->f->rhash_params);
			refcount_dec(&fq->refcnt);
		} else {
			fq->flags |= INET_FRAG_HASH_DEAD;
		}
		rcu_read_unlock();
	}
}
EXPORT_SYMBOL(inet_frag_kill);

static void inet_frag_destroy_rcu(struct rcu_head *head)
{
	struct inet_frag_queue *q = container_of(head, struct inet_frag_queue,
						 rcu);
	struct inet_frags *f = q->fqdir->f;

	if (f->destructor)
		f->destructor(q);
	kmem_cache_free(f->frags_cachep, q);
}

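/* Free every skb in the rb-tree, following next_frag within each run, and
 * return the total truesize released.
 */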
unsigned int inet_frag_rbtree_purge(struct rb_root *root)
{
	struct rb_node *p = rb_first(root);
	unsigned int sum = 0;

	while (p) {
		struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);

		p = rb_next(p);
		rb_erase(&skb->rbnode, root);
		while (skb) {
			struct sk_buff *next = FRAG_CB(skb)->next_frag;

			sum += skb->truesize;
			kfree_skb(skb);
			skb = next;
		}
	}
	return sum;
}
EXPORT_SYMBOL(inet_frag_rbtree_purge);

void inet_frag_destroy(struct inet_frag_queue *q)
{
	struct fqdir *fqdir;
	unsigned int sum, sum_truesize = 0;
	struct inet_frags *f;

	WARN_ON(!(q->flags & INET_FRAG_COMPLETE));
	WARN_ON(del_timer(&q->timer) != 0);

	/* Release all fragment data. */
	fqdir = q->fqdir;
	f = fqdir->f;
	sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments);
	sum = sum_truesize + f->qsize;

	call_rcu(&q->rcu, inet_frag_destroy_rcu);

	sub_frag_mem_limit(fqdir, sum);
}
EXPORT_SYMBOL(inet_frag_destroy);

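/* Allocate and set up a new queue. The initial reference count of three
 * below accounts for the hash table, the expiration timer and the reference
 * returned to the caller of inet_frag_find().
 */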
static struct inet_frag_queue *inet_frag_alloc(struct fqdir *fqdir,
					       struct inet_frags *f,
					       void *arg)
{
	struct inet_frag_queue *q;

	q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC);
	if (!q)
		return NULL;

	q->fqdir = fqdir;
	f->constructor(q, arg);
	add_frag_mem_limit(fqdir, f->qsize);

	timer_setup(&q->timer, f->frag_expire, 0);
	spin_lock_init(&q->lock);
	refcount_set(&q->refcnt, 3);

	return q;
}

static struct inet_frag_queue *inet_frag_create(struct fqdir *fqdir,
						void *arg,
						struct inet_frag_queue **prev)
{
	struct inet_frags *f = fqdir->f;
	struct inet_frag_queue *q;

	q = inet_frag_alloc(fqdir, f, arg);
	if (!q) {
		*prev = ERR_PTR(-ENOMEM);
		return NULL;
	}
	mod_timer(&q->timer, jiffies + fqdir->timeout);

	*prev = rhashtable_lookup_get_insert_key(&fqdir->rhashtable, &q->key,
						 &q->node, f->rhash_params);
	if (*prev) {
		q->flags |= INET_FRAG_COMPLETE;
		inet_frag_kill(q);
		inet_frag_destroy(q);
		return NULL;
	}
	return q;
}

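/* Look up the queue matching @key, creating it if it does not exist yet.
 * Returns a queue with an elevated refcount, or NULL if the memory limit is
 * exceeded, allocation fails or the queue is concurrently being freed.
 */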
/* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */
struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key)
{
	struct inet_frag_queue *fq = NULL, *prev;

	if (!fqdir->high_thresh || frag_mem_limit(fqdir) > fqdir->high_thresh)
		return NULL;

	rcu_read_lock();

	prev = rhashtable_lookup(&fqdir->rhashtable, key, fqdir->f->rhash_params);
	if (!prev)
		fq = inet_frag_create(fqdir, key, &prev);
	if (!IS_ERR_OR_NULL(prev)) {
		fq = prev;
		if (!refcount_inc_not_zero(&fq->refcnt))
			fq = NULL;
	}
	rcu_read_unlock();
	return fq;
}
EXPORT_SYMBOL(inet_frag_find);

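/* Insert a fragment covering [offset, end) into the queue's rb-tree of runs.
 * Returns IPFRAG_OK on success, IPFRAG_DUP for an exact duplicate (drop just
 * the skb) or IPFRAG_OVERLAP for a partial overlap (drop the whole queue).
 */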
int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb,
			   int offset, int end)
{
	struct sk_buff *last = q->fragments_tail;

	/* RFC5722, Section 4, amended by Errata ID : 3089
	 *	When reassembling an IPv6 datagram, if
	 *	one or more of its constituent fragments is determined to be an
	 *	overlapping fragment, the entire datagram (and any constituent
	 *	fragments) MUST be silently discarded.
	 *
	 * Duplicates, however, should be ignored (i.e. skb dropped, but the
	 * queue/fragments kept for later reassembly).
	 */
	if (!last)
		fragrun_create(q, skb);  /* First fragment. */
	else if (last->ip_defrag_offset + last->len < end) {
		/* This is the common case: skb goes to the end. */
		/* Detect and discard overlaps. */
		if (offset < last->ip_defrag_offset + last->len)
			return IPFRAG_OVERLAP;
		if (offset == last->ip_defrag_offset + last->len)
			fragrun_append_to_last(q, skb);
		else
			fragrun_create(q, skb);
	} else {
		/* Binary search. Note that skb can become the first fragment,
		 * but not the last (covered above).
		 */
		struct rb_node **rbn, *parent;

		rbn = &q->rb_fragments.rb_node;
		do {
			struct sk_buff *curr;
			int curr_run_end;

			parent = *rbn;
			curr = rb_to_skb(parent);
			curr_run_end = curr->ip_defrag_offset +
					FRAG_CB(curr)->frag_run_len;
			if (end <= curr->ip_defrag_offset)
				rbn = &parent->rb_left;
			else if (offset >= curr_run_end)
				rbn = &parent->rb_right;
			else if (offset >= curr->ip_defrag_offset &&
				 end <= curr_run_end)
				return IPFRAG_DUP;
			else
				return IPFRAG_OVERLAP;
		} while (*rbn);
		/* Here we have parent properly set, and rbn pointing to
		 * one of its NULL left/right children. Insert skb.
		 */
		fragcb_clear(skb);
		rb_link_node(&skb->rbnode, parent, rbn);
		rb_insert_color(&skb->rbnode, &q->rb_fragments);
	}

	skb->ip_defrag_offset = offset;

	return IPFRAG_OK;
}
EXPORT_SYMBOL(inet_frag_queue_insert);

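/* Prepare reassembly: make @skb the queue's head (morphing it into the first
 * fragment if necessary), unclone it, and split off a head-local frag list
 * if present. Returns an opaque cursor for inet_frag_reasm_finish(), or NULL
 * on allocation failure.
 */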
void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb,
			      struct sk_buff *parent)
{
	struct sk_buff *fp, *head = skb_rb_first(&q->rb_fragments);
	struct sk_buff **nextp;
	int delta;

	if (head != skb) {
		fp = skb_clone(skb, GFP_ATOMIC);
		if (!fp)
			return NULL;
		FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag;
		if (RB_EMPTY_NODE(&skb->rbnode))
			FRAG_CB(parent)->next_frag = fp;
		else
			rb_replace_node(&skb->rbnode, &fp->rbnode,
					&q->rb_fragments);
		if (q->fragments_tail == skb)
			q->fragments_tail = fp;
		skb_morph(skb, head);
		FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag;
		rb_replace_node(&head->rbnode, &skb->rbnode,
				&q->rb_fragments);
		consume_skb(head);
		head = skb;
	}
	WARN_ON(head->ip_defrag_offset != 0);

	delta = -head->truesize;

	/* Head of list must not be cloned. */
	if (skb_unclone(head, GFP_ATOMIC))
		return NULL;

	delta += head->truesize;
	if (delta)
		add_frag_mem_limit(q->fqdir, delta);

	/* If the first fragment is fragmented itself, we split
	 * it to two chunks: the first with data and paged part
	 * and the second, holding only fragments.
	 */
	if (skb_has_frag_list(head)) {
		struct sk_buff *clone;
		int i, plen = 0;

		clone = alloc_skb(0, GFP_ATOMIC);
		if (!clone)
			return NULL;
		skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
		skb_frag_list_init(head);
		for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
			plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
		clone->data_len = head->data_len - plen;
		clone->len = clone->data_len;
		head->truesize += clone->truesize;
		clone->csum = 0;
		clone->ip_summed = head->ip_summed;
		add_frag_mem_limit(q->fqdir, clone->truesize);
		skb_shinfo(head)->frag_list = clone;
		nextp = &clone->next;
	} else {
		nextp = &skb_shinfo(head)->frag_list;
	}

	return nextp;
}
EXPORT_SYMBOL(inet_frag_reasm_prepare);

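/* Walk the rb-tree in order, linking (or coalescing) every remaining
 * fragment onto the head skb while fixing up len, truesize and checksum,
 * then uncharge the reassembled memory from the fqdir.
 */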
void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head,
			    void *reasm_data, bool try_coalesce)
{
	struct sk_buff **nextp = (struct sk_buff **)reasm_data;
	struct rb_node *rbn;
	struct sk_buff *fp;
	int sum_truesize;

	skb_push(head, head->data - skb_network_header(head));

	/* Traverse the tree in order, to build frag_list. */
	fp = FRAG_CB(head)->next_frag;
	rbn = rb_next(&head->rbnode);
	rb_erase(&head->rbnode, &q->rb_fragments);

	sum_truesize = head->truesize;
	while (rbn || fp) {
		/* fp points to the next sk_buff in the current run;
		 * rbn points to the next run.
		 */
		/* Go through the current run. */
		while (fp) {
			struct sk_buff *next_frag = FRAG_CB(fp)->next_frag;
			bool stolen;
			int delta;

			sum_truesize += fp->truesize;
			if (head->ip_summed != fp->ip_summed)
				head->ip_summed = CHECKSUM_NONE;
			else if (head->ip_summed == CHECKSUM_COMPLETE)
				head->csum = csum_add(head->csum, fp->csum);

			if (try_coalesce && skb_try_coalesce(head, fp, &stolen,
							     &delta)) {
				kfree_skb_partial(fp, stolen);
			} else {
				fp->prev = NULL;
				memset(&fp->rbnode, 0, sizeof(fp->rbnode));
				fp->sk = NULL;

				head->data_len += fp->len;
				head->len += fp->len;
				head->truesize += fp->truesize;

				*nextp = fp;
				nextp = &fp->next;
			}

			fp = next_frag;
		}
		/* Move to the next run. */
		if (rbn) {
			struct rb_node *rbnext = rb_next(rbn);

			fp = rb_to_skb(rbn);
			rb_erase(rbn, &q->rb_fragments);
			rbn = rbnext;
		}
	}
	sub_frag_mem_limit(q->fqdir, sum_truesize);

	*nextp = NULL;
	skb_mark_not_on_list(head);
	head->prev = NULL;
	head->tstamp = q->stamp;
}
EXPORT_SYMBOL(inet_frag_reasm_finish);

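/* Detach and return the queue's first fragment; the remainder of its run, if
 * any, takes its place in the rb-tree. Returns NULL if the queue is empty.
 */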
struct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q)
{
	struct sk_buff *head, *skb;

	head = skb_rb_first(&q->rb_fragments);
	if (!head)
		return NULL;
	skb = FRAG_CB(head)->next_frag;
	if (skb)
		rb_replace_node(&head->rbnode, &skb->rbnode,
				&q->rb_fragments);
	else
		rb_erase(&head->rbnode, &q->rb_fragments);
	memset(&head->rbnode, 0, sizeof(head->rbnode));
	barrier();

	if (head == q->fragments_tail)
		q->fragments_tail = NULL;

	sub_frag_mem_limit(q->fqdir, head->truesize);

	return head;
}
EXPORT_SYMBOL(inet_frag_pull_head);