// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2016 Facebook
 */
4 #include <linux/cpumask.h>
5 #include <linux/spinlock.h>
6 #include <linux/percpu.h>
8 #include "bpf_lru_list.h"
/* Common (non-percpu) LRU mode: number of free nodes moved to a CPU-local
 * free list in one refill, and the value bpf_lru_init() stores in
 * lru->nr_scans for this mode.
 */
#define LOCAL_FREE_TARGET		(128)
#define LOCAL_NR_SCANS			LOCAL_FREE_TARGET

/* Percpu LRU mode: shrink target used when the per-CPU free list is empty,
 * and the value bpf_lru_init() stores in lru->nr_scans for this mode.
 */
#define PERCPU_FREE_TARGET		(4)
#define PERCPU_NR_SCANS			PERCPU_FREE_TARGET

/* Helpers to get the local list index */
#define LOCAL_LIST_IDX(t)	((t) - BPF_LOCAL_LIST_T_OFFSET)
#define LOCAL_FREE_LIST_IDX	LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_FREE)
#define LOCAL_PENDING_LIST_IDX	LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_PENDING)
/* True when list type @t belongs to a CPU-local list */
#define IS_LOCAL_LIST_TYPE(t)	((t) >= BPF_LOCAL_LIST_T_OFFSET)
22 static int get_next_cpu(int cpu
)
24 cpu
= cpumask_next(cpu
, cpu_possible_mask
);
25 if (cpu
>= nr_cpu_ids
)
26 cpu
= cpumask_first(cpu_possible_mask
);
30 /* Local list helpers */
31 static struct list_head
*local_free_list(struct bpf_lru_locallist
*loc_l
)
33 return &loc_l
->lists
[LOCAL_FREE_LIST_IDX
];
36 static struct list_head
*local_pending_list(struct bpf_lru_locallist
*loc_l
)
38 return &loc_l
->lists
[LOCAL_PENDING_LIST_IDX
];
41 /* bpf_lru_node helpers */
42 static bool bpf_lru_node_is_ref(const struct bpf_lru_node
*node
)
44 return READ_ONCE(node
->ref
);
47 static void bpf_lru_node_clear_ref(struct bpf_lru_node
*node
)
49 WRITE_ONCE(node
->ref
, 0);
52 static void bpf_lru_list_count_inc(struct bpf_lru_list
*l
,
53 enum bpf_lru_list_type type
)
55 if (type
< NR_BPF_LRU_LIST_COUNT
)
59 static void bpf_lru_list_count_dec(struct bpf_lru_list
*l
,
60 enum bpf_lru_list_type type
)
62 if (type
< NR_BPF_LRU_LIST_COUNT
)
66 static void __bpf_lru_node_move_to_free(struct bpf_lru_list
*l
,
67 struct bpf_lru_node
*node
,
68 struct list_head
*free_list
,
69 enum bpf_lru_list_type tgt_free_type
)
71 if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node
->type
)))
74 /* If the removing node is the next_inactive_rotation candidate,
75 * move the next_inactive_rotation pointer also.
77 if (&node
->list
== l
->next_inactive_rotation
)
78 l
->next_inactive_rotation
= l
->next_inactive_rotation
->prev
;
80 bpf_lru_list_count_dec(l
, node
->type
);
82 node
->type
= tgt_free_type
;
83 list_move(&node
->list
, free_list
);
86 /* Move nodes from local list to the LRU list */
87 static void __bpf_lru_node_move_in(struct bpf_lru_list
*l
,
88 struct bpf_lru_node
*node
,
89 enum bpf_lru_list_type tgt_type
)
91 if (WARN_ON_ONCE(!IS_LOCAL_LIST_TYPE(node
->type
)) ||
92 WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(tgt_type
)))
95 bpf_lru_list_count_inc(l
, tgt_type
);
96 node
->type
= tgt_type
;
97 bpf_lru_node_clear_ref(node
);
98 list_move(&node
->list
, &l
->lists
[tgt_type
]);
101 /* Move nodes between or within active and inactive list (like
102 * active to inactive, inactive to active or tail of active back to
103 * the head of active).
105 static void __bpf_lru_node_move(struct bpf_lru_list
*l
,
106 struct bpf_lru_node
*node
,
107 enum bpf_lru_list_type tgt_type
)
109 if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node
->type
)) ||
110 WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(tgt_type
)))
113 if (node
->type
!= tgt_type
) {
114 bpf_lru_list_count_dec(l
, node
->type
);
115 bpf_lru_list_count_inc(l
, tgt_type
);
116 node
->type
= tgt_type
;
118 bpf_lru_node_clear_ref(node
);
120 /* If the moving node is the next_inactive_rotation candidate,
121 * move the next_inactive_rotation pointer also.
123 if (&node
->list
== l
->next_inactive_rotation
)
124 l
->next_inactive_rotation
= l
->next_inactive_rotation
->prev
;
126 list_move(&node
->list
, &l
->lists
[tgt_type
]);
129 static bool bpf_lru_list_inactive_low(const struct bpf_lru_list
*l
)
131 return l
->counts
[BPF_LRU_LIST_T_INACTIVE
] <
132 l
->counts
[BPF_LRU_LIST_T_ACTIVE
];
135 /* Rotate the active list:
137 * 2. If the node has the ref bit set, it will be rotated
138 * back to the head of active list with the ref bit cleared.
139 * Give this node one more chance to survive in the active list.
140 * 3. If the ref bit is not set, move it to the head of the
142 * 4. It will at most scan nr_scans nodes
144 static void __bpf_lru_list_rotate_active(struct bpf_lru
*lru
,
145 struct bpf_lru_list
*l
)
147 struct list_head
*active
= &l
->lists
[BPF_LRU_LIST_T_ACTIVE
];
148 struct bpf_lru_node
*node
, *tmp_node
, *first_node
;
151 first_node
= list_first_entry(active
, struct bpf_lru_node
, list
);
152 list_for_each_entry_safe_reverse(node
, tmp_node
, active
, list
) {
153 if (bpf_lru_node_is_ref(node
))
154 __bpf_lru_node_move(l
, node
, BPF_LRU_LIST_T_ACTIVE
);
156 __bpf_lru_node_move(l
, node
, BPF_LRU_LIST_T_INACTIVE
);
158 if (++i
== lru
->nr_scans
|| node
== first_node
)
163 /* Rotate the inactive list. It starts from the next_inactive_rotation
164 * 1. If the node has ref bit set, it will be moved to the head
165 * of active list with the ref bit cleared.
166 * 2. If the node does not have ref bit set, it will leave it
167 * at its current location (i.e. do nothing) so that it can
168 * be considered during the next inactive_shrink.
169 * 3. It will at most scan nr_scans nodes
171 static void __bpf_lru_list_rotate_inactive(struct bpf_lru
*lru
,
172 struct bpf_lru_list
*l
)
174 struct list_head
*inactive
= &l
->lists
[BPF_LRU_LIST_T_INACTIVE
];
175 struct list_head
*cur
, *last
, *next
= inactive
;
176 struct bpf_lru_node
*node
;
179 if (list_empty(inactive
))
182 last
= l
->next_inactive_rotation
->next
;
183 if (last
== inactive
)
186 cur
= l
->next_inactive_rotation
;
187 while (i
< lru
->nr_scans
) {
188 if (cur
== inactive
) {
193 node
= list_entry(cur
, struct bpf_lru_node
, list
);
195 if (bpf_lru_node_is_ref(node
))
196 __bpf_lru_node_move(l
, node
, BPF_LRU_LIST_T_ACTIVE
);
203 l
->next_inactive_rotation
= next
;
206 /* Shrink the inactive list. It starts from the tail of the
207 * inactive list and only move the nodes without the ref bit
208 * set to the designated free list.
211 __bpf_lru_list_shrink_inactive(struct bpf_lru
*lru
,
212 struct bpf_lru_list
*l
,
213 unsigned int tgt_nshrink
,
214 struct list_head
*free_list
,
215 enum bpf_lru_list_type tgt_free_type
)
217 struct list_head
*inactive
= &l
->lists
[BPF_LRU_LIST_T_INACTIVE
];
218 struct bpf_lru_node
*node
, *tmp_node
;
219 unsigned int nshrinked
= 0;
222 list_for_each_entry_safe_reverse(node
, tmp_node
, inactive
, list
) {
223 if (bpf_lru_node_is_ref(node
)) {
224 __bpf_lru_node_move(l
, node
, BPF_LRU_LIST_T_ACTIVE
);
225 } else if (lru
->del_from_htab(lru
->del_arg
, node
)) {
226 __bpf_lru_node_move_to_free(l
, node
, free_list
,
228 if (++nshrinked
== tgt_nshrink
)
232 if (++i
== lru
->nr_scans
)
/* 1. Rotate the active list (if needed, i.e. when the inactive list
 *    holds fewer nodes than the active list)
 * 2. Always rotate the inactive list
 */
static void __bpf_lru_list_rotate(struct bpf_lru *lru, struct bpf_lru_list *l)
{
	if (bpf_lru_list_inactive_low(l))
		__bpf_lru_list_rotate_active(lru, l);

	__bpf_lru_list_rotate_inactive(lru, l);
}
250 /* Calls __bpf_lru_list_shrink_inactive() to shrink some
251 * ref-bit-cleared nodes and move them to the designated
254 * If it cannot get a free node after calling
255 * __bpf_lru_list_shrink_inactive(). It will just remove
256 * one node from either inactive or active list without
257 * honoring the ref-bit. It prefers inactive list to active
258 * list in this situation.
260 static unsigned int __bpf_lru_list_shrink(struct bpf_lru
*lru
,
261 struct bpf_lru_list
*l
,
262 unsigned int tgt_nshrink
,
263 struct list_head
*free_list
,
264 enum bpf_lru_list_type tgt_free_type
)
267 struct bpf_lru_node
*node
, *tmp_node
;
268 struct list_head
*force_shrink_list
;
269 unsigned int nshrinked
;
271 nshrinked
= __bpf_lru_list_shrink_inactive(lru
, l
, tgt_nshrink
,
272 free_list
, tgt_free_type
);
276 /* Do a force shrink by ignoring the reference bit */
277 if (!list_empty(&l
->lists
[BPF_LRU_LIST_T_INACTIVE
]))
278 force_shrink_list
= &l
->lists
[BPF_LRU_LIST_T_INACTIVE
];
280 force_shrink_list
= &l
->lists
[BPF_LRU_LIST_T_ACTIVE
];
282 list_for_each_entry_safe_reverse(node
, tmp_node
, force_shrink_list
,
284 if (lru
->del_from_htab(lru
->del_arg
, node
)) {
285 __bpf_lru_node_move_to_free(l
, node
, free_list
,
294 /* Flush the nodes from the local pending list to the LRU list */
295 static void __local_list_flush(struct bpf_lru_list
*l
,
296 struct bpf_lru_locallist
*loc_l
)
298 struct bpf_lru_node
*node
, *tmp_node
;
300 list_for_each_entry_safe_reverse(node
, tmp_node
,
301 local_pending_list(loc_l
), list
) {
302 if (bpf_lru_node_is_ref(node
))
303 __bpf_lru_node_move_in(l
, node
, BPF_LRU_LIST_T_ACTIVE
);
305 __bpf_lru_node_move_in(l
, node
,
306 BPF_LRU_LIST_T_INACTIVE
);
310 static void bpf_lru_list_push_free(struct bpf_lru_list
*l
,
311 struct bpf_lru_node
*node
)
315 if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node
->type
)))
318 raw_spin_lock_irqsave(&l
->lock
, flags
);
319 __bpf_lru_node_move(l
, node
, BPF_LRU_LIST_T_FREE
);
320 raw_spin_unlock_irqrestore(&l
->lock
, flags
);
323 static void bpf_lru_list_pop_free_to_local(struct bpf_lru
*lru
,
324 struct bpf_lru_locallist
*loc_l
)
326 struct bpf_lru_list
*l
= &lru
->common_lru
.lru_list
;
327 struct bpf_lru_node
*node
, *tmp_node
;
328 unsigned int nfree
= 0;
330 raw_spin_lock(&l
->lock
);
332 __local_list_flush(l
, loc_l
);
334 __bpf_lru_list_rotate(lru
, l
);
336 list_for_each_entry_safe(node
, tmp_node
, &l
->lists
[BPF_LRU_LIST_T_FREE
],
338 __bpf_lru_node_move_to_free(l
, node
, local_free_list(loc_l
),
339 BPF_LRU_LOCAL_LIST_T_FREE
);
340 if (++nfree
== LOCAL_FREE_TARGET
)
344 if (nfree
< LOCAL_FREE_TARGET
)
345 __bpf_lru_list_shrink(lru
, l
, LOCAL_FREE_TARGET
- nfree
,
346 local_free_list(loc_l
),
347 BPF_LRU_LOCAL_LIST_T_FREE
);
349 raw_spin_unlock(&l
->lock
);
352 static void __local_list_add_pending(struct bpf_lru
*lru
,
353 struct bpf_lru_locallist
*loc_l
,
355 struct bpf_lru_node
*node
,
358 *(u32
*)((void *)node
+ lru
->hash_offset
) = hash
;
360 node
->type
= BPF_LRU_LOCAL_LIST_T_PENDING
;
361 bpf_lru_node_clear_ref(node
);
362 list_add(&node
->list
, local_pending_list(loc_l
));
365 static struct bpf_lru_node
*
366 __local_list_pop_free(struct bpf_lru_locallist
*loc_l
)
368 struct bpf_lru_node
*node
;
370 node
= list_first_entry_or_null(local_free_list(loc_l
),
374 list_del(&node
->list
);
379 static struct bpf_lru_node
*
380 __local_list_pop_pending(struct bpf_lru
*lru
, struct bpf_lru_locallist
*loc_l
)
382 struct bpf_lru_node
*node
;
386 /* Get from the tail (i.e. older element) of the pending list. */
387 list_for_each_entry_reverse(node
, local_pending_list(loc_l
),
389 if ((!bpf_lru_node_is_ref(node
) || force
) &&
390 lru
->del_from_htab(lru
->del_arg
, node
)) {
391 list_del(&node
->list
);
404 static struct bpf_lru_node
*bpf_percpu_lru_pop_free(struct bpf_lru
*lru
,
407 struct list_head
*free_list
;
408 struct bpf_lru_node
*node
= NULL
;
409 struct bpf_lru_list
*l
;
411 int cpu
= raw_smp_processor_id();
413 l
= per_cpu_ptr(lru
->percpu_lru
, cpu
);
415 raw_spin_lock_irqsave(&l
->lock
, flags
);
417 __bpf_lru_list_rotate(lru
, l
);
419 free_list
= &l
->lists
[BPF_LRU_LIST_T_FREE
];
420 if (list_empty(free_list
))
421 __bpf_lru_list_shrink(lru
, l
, PERCPU_FREE_TARGET
, free_list
,
422 BPF_LRU_LIST_T_FREE
);
424 if (!list_empty(free_list
)) {
425 node
= list_first_entry(free_list
, struct bpf_lru_node
, list
);
426 *(u32
*)((void *)node
+ lru
->hash_offset
) = hash
;
427 bpf_lru_node_clear_ref(node
);
428 __bpf_lru_node_move(l
, node
, BPF_LRU_LIST_T_INACTIVE
);
431 raw_spin_unlock_irqrestore(&l
->lock
, flags
);
436 static struct bpf_lru_node
*bpf_common_lru_pop_free(struct bpf_lru
*lru
,
439 struct bpf_lru_locallist
*loc_l
, *steal_loc_l
;
440 struct bpf_common_lru
*clru
= &lru
->common_lru
;
441 struct bpf_lru_node
*node
;
442 int steal
, first_steal
;
444 int cpu
= raw_smp_processor_id();
446 loc_l
= per_cpu_ptr(clru
->local_list
, cpu
);
448 raw_spin_lock_irqsave(&loc_l
->lock
, flags
);
450 node
= __local_list_pop_free(loc_l
);
452 bpf_lru_list_pop_free_to_local(lru
, loc_l
);
453 node
= __local_list_pop_free(loc_l
);
457 __local_list_add_pending(lru
, loc_l
, cpu
, node
, hash
);
459 raw_spin_unlock_irqrestore(&loc_l
->lock
, flags
);
464 /* No free nodes found from the local free list and
465 * the global LRU list.
467 * Steal from the local free/pending list of the
468 * current CPU and remote CPU in RR. It starts
469 * with the loc_l->next_steal CPU.
472 first_steal
= loc_l
->next_steal
;
475 steal_loc_l
= per_cpu_ptr(clru
->local_list
, steal
);
477 raw_spin_lock_irqsave(&steal_loc_l
->lock
, flags
);
479 node
= __local_list_pop_free(steal_loc_l
);
481 node
= __local_list_pop_pending(lru
, steal_loc_l
);
483 raw_spin_unlock_irqrestore(&steal_loc_l
->lock
, flags
);
485 steal
= get_next_cpu(steal
);
486 } while (!node
&& steal
!= first_steal
);
488 loc_l
->next_steal
= steal
;
491 raw_spin_lock_irqsave(&loc_l
->lock
, flags
);
492 __local_list_add_pending(lru
, loc_l
, cpu
, node
, hash
);
493 raw_spin_unlock_irqrestore(&loc_l
->lock
, flags
);
499 struct bpf_lru_node
*bpf_lru_pop_free(struct bpf_lru
*lru
, u32 hash
)
502 return bpf_percpu_lru_pop_free(lru
, hash
);
504 return bpf_common_lru_pop_free(lru
, hash
);
507 static void bpf_common_lru_push_free(struct bpf_lru
*lru
,
508 struct bpf_lru_node
*node
)
510 u8 node_type
= READ_ONCE(node
->type
);
513 if (WARN_ON_ONCE(node_type
== BPF_LRU_LIST_T_FREE
) ||
514 WARN_ON_ONCE(node_type
== BPF_LRU_LOCAL_LIST_T_FREE
))
517 if (node_type
== BPF_LRU_LOCAL_LIST_T_PENDING
) {
518 struct bpf_lru_locallist
*loc_l
;
520 loc_l
= per_cpu_ptr(lru
->common_lru
.local_list
, node
->cpu
);
522 raw_spin_lock_irqsave(&loc_l
->lock
, flags
);
524 if (unlikely(node
->type
!= BPF_LRU_LOCAL_LIST_T_PENDING
)) {
525 raw_spin_unlock_irqrestore(&loc_l
->lock
, flags
);
529 node
->type
= BPF_LRU_LOCAL_LIST_T_FREE
;
530 bpf_lru_node_clear_ref(node
);
531 list_move(&node
->list
, local_free_list(loc_l
));
533 raw_spin_unlock_irqrestore(&loc_l
->lock
, flags
);
538 bpf_lru_list_push_free(&lru
->common_lru
.lru_list
, node
);
541 static void bpf_percpu_lru_push_free(struct bpf_lru
*lru
,
542 struct bpf_lru_node
*node
)
544 struct bpf_lru_list
*l
;
547 l
= per_cpu_ptr(lru
->percpu_lru
, node
->cpu
);
549 raw_spin_lock_irqsave(&l
->lock
, flags
);
551 __bpf_lru_node_move(l
, node
, BPF_LRU_LIST_T_FREE
);
553 raw_spin_unlock_irqrestore(&l
->lock
, flags
);
556 void bpf_lru_push_free(struct bpf_lru
*lru
, struct bpf_lru_node
*node
)
559 bpf_percpu_lru_push_free(lru
, node
);
561 bpf_common_lru_push_free(lru
, node
);
564 static void bpf_common_lru_populate(struct bpf_lru
*lru
, void *buf
,
565 u32 node_offset
, u32 elem_size
,
568 struct bpf_lru_list
*l
= &lru
->common_lru
.lru_list
;
571 for (i
= 0; i
< nr_elems
; i
++) {
572 struct bpf_lru_node
*node
;
574 node
= (struct bpf_lru_node
*)(buf
+ node_offset
);
575 node
->type
= BPF_LRU_LIST_T_FREE
;
576 bpf_lru_node_clear_ref(node
);
577 list_add(&node
->list
, &l
->lists
[BPF_LRU_LIST_T_FREE
]);
582 static void bpf_percpu_lru_populate(struct bpf_lru
*lru
, void *buf
,
583 u32 node_offset
, u32 elem_size
,
588 struct bpf_lru_list
*l
;
590 pcpu_entries
= nr_elems
/ num_possible_cpus();
594 for_each_possible_cpu(cpu
) {
595 struct bpf_lru_node
*node
;
597 l
= per_cpu_ptr(lru
->percpu_lru
, cpu
);
599 node
= (struct bpf_lru_node
*)(buf
+ node_offset
);
601 node
->type
= BPF_LRU_LIST_T_FREE
;
602 bpf_lru_node_clear_ref(node
);
603 list_add(&node
->list
, &l
->lists
[BPF_LRU_LIST_T_FREE
]);
608 if (i
% pcpu_entries
)
613 void bpf_lru_populate(struct bpf_lru
*lru
, void *buf
, u32 node_offset
,
614 u32 elem_size
, u32 nr_elems
)
617 bpf_percpu_lru_populate(lru
, buf
, node_offset
, elem_size
,
620 bpf_common_lru_populate(lru
, buf
, node_offset
, elem_size
,
624 static void bpf_lru_locallist_init(struct bpf_lru_locallist
*loc_l
, int cpu
)
628 for (i
= 0; i
< NR_BPF_LRU_LOCAL_LIST_T
; i
++)
629 INIT_LIST_HEAD(&loc_l
->lists
[i
]);
631 loc_l
->next_steal
= cpu
;
633 raw_spin_lock_init(&loc_l
->lock
);
636 static void bpf_lru_list_init(struct bpf_lru_list
*l
)
640 for (i
= 0; i
< NR_BPF_LRU_LIST_T
; i
++)
641 INIT_LIST_HEAD(&l
->lists
[i
]);
643 for (i
= 0; i
< NR_BPF_LRU_LIST_COUNT
; i
++)
646 l
->next_inactive_rotation
= &l
->lists
[BPF_LRU_LIST_T_INACTIVE
];
648 raw_spin_lock_init(&l
->lock
);
651 int bpf_lru_init(struct bpf_lru
*lru
, bool percpu
, u32 hash_offset
,
652 del_from_htab_func del_from_htab
, void *del_arg
)
657 lru
->percpu_lru
= alloc_percpu(struct bpf_lru_list
);
658 if (!lru
->percpu_lru
)
661 for_each_possible_cpu(cpu
) {
662 struct bpf_lru_list
*l
;
664 l
= per_cpu_ptr(lru
->percpu_lru
, cpu
);
665 bpf_lru_list_init(l
);
667 lru
->nr_scans
= PERCPU_NR_SCANS
;
669 struct bpf_common_lru
*clru
= &lru
->common_lru
;
671 clru
->local_list
= alloc_percpu(struct bpf_lru_locallist
);
672 if (!clru
->local_list
)
675 for_each_possible_cpu(cpu
) {
676 struct bpf_lru_locallist
*loc_l
;
678 loc_l
= per_cpu_ptr(clru
->local_list
, cpu
);
679 bpf_lru_locallist_init(loc_l
, cpu
);
682 bpf_lru_list_init(&clru
->lru_list
);
683 lru
->nr_scans
= LOCAL_NR_SCANS
;
686 lru
->percpu
= percpu
;
687 lru
->del_from_htab
= del_from_htab
;
688 lru
->del_arg
= del_arg
;
689 lru
->hash_offset
= hash_offset
;
694 void bpf_lru_destroy(struct bpf_lru
*lru
)
697 free_percpu(lru
->percpu_lru
);
699 free_percpu(lru
->common_lru
.local_list
);