// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * inet fragments management
 *
 * Authors:	Pavel Emelyanov <xemul@openvz.org>
 *		Started as consolidation of ipv4/ip_fragment.c,
 *		ipv6/reassembly.c and ipv6 nf conntrack reassembly
 */

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/random.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/slab.h>
#include <linux/rhashtable.h>

#include <net/sock.h>
#include <net/inet_frag.h>
#include <net/inet_ecn.h>
#include <net/ip.h>
#include <net/ipv6.h>
/* Use skb->cb to track consecutive/adjacent fragments coming at
 * the end of the queue. Nodes in the rb-tree queue will
 * contain "runs" of one or more adjacent fragments.
 *
 * Invariants:
 * - next_frag is NULL at the tail of a "run";
 * - the head of a "run" has the sum of all fragment lengths in frag_run_len.
 */
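/* Example: fragments covering bytes [0,1200) and [1200,2400) that arrive
 * back-to-back at the tail share one rb-tree node (a single "run" of two
 * skbs chained through next_frag), while a later fragment at [4000,5200)
 * that leaves a hole starts a new run and therefore a new rb-tree node.
 */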
struct ipfrag_skb_cb {
	union {
		struct inet_skb_parm	h4;
		struct inet6_skb_parm	h6;
	};
	struct sk_buff		*next_frag;
	int			frag_run_len;
};

#define FRAG_CB(skb)		((struct ipfrag_skb_cb *)((skb)->cb))
static void fragcb_clear(struct sk_buff *skb)
{
	RB_CLEAR_NODE(&skb->rbnode);
	FRAG_CB(skb)->next_frag = NULL;
	FRAG_CB(skb)->frag_run_len = skb->len;
}
/* Append skb to the last "run". */
static void fragrun_append_to_last(struct inet_frag_queue *q,
				   struct sk_buff *skb)
{
	fragcb_clear(skb);

	FRAG_CB(q->last_run_head)->frag_run_len += skb->len;
	FRAG_CB(q->fragments_tail)->next_frag = skb;
	q->fragments_tail = skb;
}
/* Create a new "run" with the skb. */
static void fragrun_create(struct inet_frag_queue *q, struct sk_buff *skb)
{
	BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb));
	fragcb_clear(skb);

	if (q->last_run_head)
		rb_link_node(&skb->rbnode, &q->last_run_head->rbnode,
			     &q->last_run_head->rbnode.rb_right);
	else
		rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node);
	rb_insert_color(&skb->rbnode, &q->rb_fragments);

	q->fragments_tail = skb;
	q->last_run_head = skb;
}
/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
 * Value : 0xff if frame should be dropped.
 *         0 or INET_ECN_CE value, to be ORed in to final iph->tos field
 */
const u8 ip_frag_ecn_table[16] = {
	/* at least one fragment had CE, and others ECT_0 or ECT_1 */
	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0]			= INET_ECN_CE,
	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1]			= INET_ECN_CE,
	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1]	= INET_ECN_CE,

	/* invalid combinations : drop frame */
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
};
EXPORT_SYMBOL(ip_frag_ecn_table);
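/* Combinations not listed above are left at 0, so nothing extra is ORed into
 * the reassembled header for them.  The IPv4/IPv6 reassembly paths OR each
 * fragment's IPFRAG_ECN_* value into the queue as fragments arrive, then look
 * the result up here once and drop the datagram if the entry reads 0xff.
 */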
int inet_frags_init(struct inet_frags *f)
{
	f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0,
					    NULL);
	if (!f->frags_cachep)
		return -ENOMEM;

	refcount_set(&f->refcnt, 1);
	init_completion(&f->completion);
	return 0;
}
EXPORT_SYMBOL(inet_frags_init);
void inet_frags_fini(struct inet_frags *f)
{
	if (refcount_dec_and_test(&f->refcnt))
		complete(&f->completion);

	wait_for_completion(&f->completion);

	kmem_cache_destroy(f->frags_cachep);
	f->frags_cachep = NULL;
}
EXPORT_SYMBOL(inet_frags_fini);
/* called from rhashtable_free_and_destroy() at netns_frags dismantle */
static void inet_frags_free_cb(void *ptr, void *arg)
{
	struct inet_frag_queue *fq = ptr;
	int count;

	count = del_timer_sync(&fq->timer) ? 1 : 0;

	spin_lock_bh(&fq->lock);
	if (!(fq->flags & INET_FRAG_COMPLETE)) {
		fq->flags |= INET_FRAG_COMPLETE;
		count++;
	} else if (fq->flags & INET_FRAG_HASH_DEAD) {
		count++;
	}
	spin_unlock_bh(&fq->lock);

	if (refcount_sub_and_test(count, &fq->refcnt))
		inet_frag_destroy(fq);
}
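/* Deferred per-namespace teardown: walk the whole rhashtable, completing and
 * releasing every remaining queue through inet_frags_free_cb(), then wait for
 * in-flight RCU callbacks before dropping the inet_frags reference.
 */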
static void fqdir_work_fn(struct work_struct *work)
{
	struct fqdir *fqdir = container_of(work, struct fqdir, destroy_work);
	struct inet_frags *f = fqdir->f;

	rhashtable_free_and_destroy(&fqdir->rhashtable, inet_frags_free_cb, NULL);

	/* We need to make sure all ongoing call_rcu(..., inet_frag_destroy_rcu)
	 * have completed, since they need to dereference fqdir.
	 * Would it not be nice to have kfree_rcu_barrier() ? :)
	 */
	rcu_barrier();

	if (refcount_dec_and_test(&f->refcnt))
		complete(&f->completion);

	kfree(fqdir);
}
int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net)
{
	struct fqdir *fqdir = kzalloc(sizeof(*fqdir), GFP_KERNEL);
	int res;

	if (!fqdir)
		return -ENOMEM;
	fqdir->f = f;
	fqdir->net = net;
	res = rhashtable_init(&fqdir->rhashtable, &fqdir->f->rhash_params);
	if (res < 0) {
		kfree(fqdir);
		return res;
	}
	refcount_inc(&f->refcnt);
	*fqdirp = fqdir;
	return 0;
}
EXPORT_SYMBOL(fqdir_init);
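/* Tearing the table down can block for a while (rhashtable_free_and_destroy()
 * and rcu_barrier() both sleep), so fqdir_exit() only schedules the work and
 * lets fqdir_work_fn() do the heavy lifting from the system workqueue.
 */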
void fqdir_exit(struct fqdir *fqdir)
{
	INIT_WORK(&fqdir->destroy_work, fqdir_work_fn);
	queue_work(system_wq, &fqdir->destroy_work);
}
EXPORT_SYMBOL(fqdir_exit);
void inet_frag_kill(struct inet_frag_queue *fq)
{
	if (del_timer(&fq->timer))
		refcount_dec(&fq->refcnt);

	if (!(fq->flags & INET_FRAG_COMPLETE)) {
		struct fqdir *fqdir = fq->fqdir;

		fq->flags |= INET_FRAG_COMPLETE;
		rcu_read_lock();
		/* The RCU read lock provides a memory barrier
		 * guaranteeing that if fqdir->dead is false then
		 * the hash table destruction will not start until
		 * after we unlock. Paired with inet_frags_exit_net().
		 */
		if (!fqdir->dead) {
			rhashtable_remove_fast(&fqdir->rhashtable, &fq->node,
					       fqdir->f->rhash_params);
			refcount_dec(&fq->refcnt);
		} else {
			fq->flags |= INET_FRAG_HASH_DEAD;
		}
		rcu_read_unlock();
	}
}
EXPORT_SYMBOL(inet_frag_kill);
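/* Final free of a queue, deferred by call_rcu() so that lockless rhashtable
 * readers that may still hold a pointer to it are done before it is recycled.
 */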
static void inet_frag_destroy_rcu(struct rcu_head *head)
{
	struct inet_frag_queue *q = container_of(head, struct inet_frag_queue,
						 rcu);
	struct inet_frags *f = q->fqdir->f;

	if (f->destructor)
		f->destructor(q);
	kmem_cache_free(f->frags_cachep, q);
}
unsigned int inet_frag_rbtree_purge(struct rb_root *root)
{
	struct rb_node *p = rb_first(root);
	unsigned int sum = 0;

	while (p) {
		struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);

		p = rb_next(p);
		rb_erase(&skb->rbnode, root);
		while (skb) {
			struct sk_buff *next = FRAG_CB(skb)->next_frag;

			sum += skb->truesize;
			kfree_skb(skb);
			skb = next;
		}
	}
	return sum;
}
EXPORT_SYMBOL(inet_frag_rbtree_purge);
void inet_frag_destroy(struct inet_frag_queue *q)
{
	struct fqdir *fqdir;
	unsigned int sum, sum_truesize = 0;
	struct inet_frags *f;

	WARN_ON(!(q->flags & INET_FRAG_COMPLETE));
	WARN_ON(del_timer(&q->timer) != 0);

	/* Release all fragment data. */
	fqdir = q->fqdir;
	f = fqdir->f;
	sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments);
	sum = sum_truesize + f->qsize;

	call_rcu(&q->rcu, inet_frag_destroy_rcu);

	sub_frag_mem_limit(fqdir, sum);
}
EXPORT_SYMBOL(inet_frag_destroy);
static struct inet_frag_queue *inet_frag_alloc(struct fqdir *fqdir,
					       struct inet_frags *f,
					       void *arg)
{
	struct inet_frag_queue *q;

	q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC);
	if (!q)
		return NULL;

	q->fqdir = fqdir;
	f->constructor(q, arg);
	add_frag_mem_limit(fqdir, f->qsize);

	timer_setup(&q->timer, f->frag_expire, 0);
	spin_lock_init(&q->lock);
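	/* Three references: one for the hash table slot, one for the pending
	 * expiry timer, and one handed back to the caller via inet_frag_find().
	 */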
	refcount_set(&q->refcnt, 3);

	return q;
}
static struct inet_frag_queue *inet_frag_create(struct fqdir *fqdir,
						void *arg,
						struct inet_frag_queue **prev)
{
	struct inet_frags *f = fqdir->f;
	struct inet_frag_queue *q;

	q = inet_frag_alloc(fqdir, f, arg);
	if (!q) {
		*prev = ERR_PTR(-ENOMEM);
		return NULL;
	}
	mod_timer(&q->timer, jiffies + fqdir->timeout);

	*prev = rhashtable_lookup_get_insert_key(&fqdir->rhashtable, &q->key,
						 &q->node, f->rhash_params);
	if (*prev) {
		q->flags |= INET_FRAG_COMPLETE;
		inet_frag_kill(q);
		inet_frag_destroy(q);
		return NULL;
	}
	return q;
}
/* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */
struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key)
{
	struct inet_frag_queue *fq = NULL, *prev;

	if (!fqdir->high_thresh || frag_mem_limit(fqdir) > fqdir->high_thresh)
		return NULL;

	rcu_read_lock();

	prev = rhashtable_lookup(&fqdir->rhashtable, key, fqdir->f->rhash_params);
	if (!prev)
		fq = inet_frag_create(fqdir, key, &prev);
	if (!IS_ERR_OR_NULL(prev)) {
		fq = prev;
		if (!refcount_inc_not_zero(&fq->refcnt))
			fq = NULL;
	}
	rcu_read_unlock();
	return fq;
}
EXPORT_SYMBOL(inet_frag_find);
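/* Insert a fragment covering [offset, end) into the queue's rb-tree of runs.
 * Returns IPFRAG_OK on success, IPFRAG_DUP if the data is already fully
 * present (the caller just drops the skb), or IPFRAG_OVERLAP on a partial
 * overlap, in which case the caller discards the whole queue per RFC 5722.
 */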
int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb,
			   int offset, int end)
{
	struct sk_buff *last = q->fragments_tail;

	/* RFC5722, Section 4, amended by Errata ID : 3089
	 *                          When reassembling an IPv6 datagram, if
	 *   one or more its constituent fragments is determined to be an
	 *   overlapping fragment, the entire datagram (and any constituent
	 *   fragments) MUST be silently discarded.
	 *
	 * Duplicates, however, should be ignored (i.e. skb dropped, but the
	 * queue/fragments kept for later reassembly).
	 */
	if (!last)
		fragrun_create(q, skb);  /* First fragment. */
	else if (last->ip_defrag_offset + last->len < end) {
		/* This is the common case: skb goes to the end. */
		/* Detect and discard overlaps. */
		if (offset < last->ip_defrag_offset + last->len)
			return IPFRAG_OVERLAP;
		if (offset == last->ip_defrag_offset + last->len)
			fragrun_append_to_last(q, skb);
		else
			fragrun_create(q, skb);
	} else {
		/* Binary search. Note that skb can become the first fragment,
		 * but not the last (covered above).
		 */
		struct rb_node **rbn, *parent;

		rbn = &q->rb_fragments.rb_node;
		do {
			struct sk_buff *curr;
			int curr_run_end;

			parent = *rbn;
			curr = rb_to_skb(parent);
			curr_run_end = curr->ip_defrag_offset +
					FRAG_CB(curr)->frag_run_len;
			if (end <= curr->ip_defrag_offset)
				rbn = &parent->rb_left;
			else if (offset >= curr_run_end)
				rbn = &parent->rb_right;
			else if (offset >= curr->ip_defrag_offset &&
				 end <= curr_run_end)
				return IPFRAG_DUP;
			else
				return IPFRAG_OVERLAP;
		} while (*rbn);
		/* Here we have parent properly set, and rbn pointing to
		 * one of its NULL left/right children. Insert skb.
		 */
		fragcb_clear(skb);
		rb_link_node(&skb->rbnode, parent, rbn);
		rb_insert_color(&skb->rbnode, &q->rb_fragments);
	}

	skb->ip_defrag_offset = offset;

	return IPFRAG_OK;
}
EXPORT_SYMBOL(inet_frag_queue_insert);
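/* Prepare reassembly: make the skb that completed the datagram take over the
 * role of the first fragment in the tree (morphing it onto the current head),
 * ensure that head is not cloned, and return an opaque cursor (the frag_list
 * tail pointer) to be handed to inet_frag_reasm_finish().  Returns NULL on
 * allocation failure.
 */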
void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb,
			      struct sk_buff *parent)
{
	struct sk_buff *fp, *head = skb_rb_first(&q->rb_fragments);
	struct sk_buff **nextp;
	int delta;

	if (head != skb) {
		fp = skb_clone(skb, GFP_ATOMIC);
		if (!fp)
			return NULL;
		FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag;
		if (RB_EMPTY_NODE(&skb->rbnode))
			FRAG_CB(parent)->next_frag = fp;
		else
			rb_replace_node(&skb->rbnode, &fp->rbnode,
					&q->rb_fragments);
		if (q->fragments_tail == skb)
			q->fragments_tail = fp;
		skb_morph(skb, head);
		FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag;
		rb_replace_node(&head->rbnode, &skb->rbnode,
				&q->rb_fragments);
		consume_skb(head);
		head = skb;
	}
	WARN_ON(head->ip_defrag_offset != 0);

	delta = -head->truesize;

	/* Head of list must not be cloned. */
	if (skb_unclone(head, GFP_ATOMIC))
		return NULL;

	delta += head->truesize;
	if (delta)
		add_frag_mem_limit(q->fqdir, delta);

	/* If the first fragment is fragmented itself, we split
	 * it to two chunks: the first with data and paged part
	 * and the second, holding only fragments.
	 */
	if (skb_has_frag_list(head)) {
		struct sk_buff *clone;
		int i, plen = 0;

		clone = alloc_skb(0, GFP_ATOMIC);
		if (!clone)
			return NULL;
		skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
		skb_frag_list_init(head);
		for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
			plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
		clone->data_len = head->data_len - plen;
		clone->len = clone->data_len;
		head->truesize += clone->truesize;
		clone->csum = 0;
		clone->ip_summed = head->ip_summed;
		add_frag_mem_limit(q->fqdir, clone->truesize);
		skb_shinfo(head)->frag_list = clone;
		nextp = &clone->next;
	} else {
		nextp = &skb_shinfo(head)->frag_list;
	}

	return nextp;
}
EXPORT_SYMBOL(inet_frag_reasm_prepare);
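/* Walk the rb-tree of runs in order and splice every remaining fragment into
 * head's frag_list (or coalesce it into head when try_coalesce allows),
 * updating length, truesize and checksum accounting and returning the charged
 * memory to the fqdir.
 */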
void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head,
			    void *reasm_data, bool try_coalesce)
{
	struct sk_buff **nextp = (struct sk_buff **)reasm_data;
	struct rb_node *rbn;
	struct sk_buff *fp;
	int sum_truesize;

	skb_push(head, head->data - skb_network_header(head));

	/* Traverse the tree in order, to build frag_list. */
	fp = FRAG_CB(head)->next_frag;
	rbn = rb_next(&head->rbnode);
	rb_erase(&head->rbnode, &q->rb_fragments);

	sum_truesize = head->truesize;
	while (rbn || fp) {
		/* fp points to the next sk_buff in the current run;
		 * rbn points to the next run.
		 */
		/* Go through the current run. */
		while (fp) {
			struct sk_buff *next_frag = FRAG_CB(fp)->next_frag;
			bool stolen;
			int delta;

			sum_truesize += fp->truesize;
			if (head->ip_summed != fp->ip_summed)
				head->ip_summed = CHECKSUM_NONE;
			else if (head->ip_summed == CHECKSUM_COMPLETE)
				head->csum = csum_add(head->csum, fp->csum);

			if (try_coalesce && skb_try_coalesce(head, fp, &stolen,
							     &delta)) {
				kfree_skb_partial(fp, stolen);
			} else {
				fp->prev = NULL;
				memset(&fp->rbnode, 0, sizeof(fp->rbnode));
				fp->sk = NULL;

				head->data_len += fp->len;
				head->len += fp->len;
				head->truesize += fp->truesize;

				*nextp = fp;
				nextp = &fp->next;
			}

			fp = next_frag;
		}
		/* Move to the next run. */
		if (rbn) {
			struct rb_node *rbnext = rb_next(rbn);

			fp = rb_to_skb(rbn);
			rb_erase(rbn, &q->rb_fragments);
			rbn = rbnext;
		}
	}
	sub_frag_mem_limit(q->fqdir, sum_truesize);

	*nextp = NULL;
	skb_mark_not_on_list(head);
	head->prev = NULL;
	head->tstamp = q->stamp;
}
EXPORT_SYMBOL(inet_frag_reasm_finish);
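/* Detach and return the first fragment of the queue, e.g. so that an expiring
 * IPv4/IPv6 queue can build an ICMP "time exceeded" error from it.
 */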
struct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q)
{
	struct sk_buff *head, *skb;

	head = skb_rb_first(&q->rb_fragments);
	if (!head)
		return NULL;
	skb = FRAG_CB(head)->next_frag;
	if (skb)
		rb_replace_node(&head->rbnode, &skb->rbnode,
				&q->rb_fragments);
	else
		rb_erase(&head->rbnode, &q->rb_fragments);
	memset(&head->rbnode, 0, sizeof(head->rbnode));
	barrier();

	if (head == q->fragments_tail)
		q->fragments_tail = NULL;

	sub_frag_mem_limit(q->fqdir, head->truesize);

	return head;
}
EXPORT_SYMBOL(inet_frag_pull_head);