kernel/sched/ext.c

   1 /* SPDX-License-Identifier: GPL-2.0 */
   2 /*
   3  * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
   4  *
   5  * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
   6  * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
   7  * Copyright (c) 2022 David Vernet <dvernet@meta.com>
   8  */
   9 #define SCX_OP_IDX(op)          (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void)))
  10
  11 enum scx_consts {
  12         SCX_DSP_DFL_MAX_BATCH           = 32,
  13         SCX_DSP_MAX_LOOPS               = 32,
  14         SCX_WATCHDOG_MAX_TIMEOUT        = 30 * HZ,
  15
  16         SCX_EXIT_BT_LEN                 = 64,
  17         SCX_EXIT_MSG_LEN                = 1024,
  18         SCX_EXIT_DUMP_DFL_LEN           = 32768,
  19
  20         SCX_CPUPERF_ONE                 = SCHED_CAPACITY_SCALE,
  21
  22         /*
  23          * Iterating all tasks may take a while. Periodically drop
  24          * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls.
  25          */
  26         SCX_OPS_TASK_ITER_BATCH         = 32,
  27 };
  28
  29 enum scx_exit_kind {
  30         SCX_EXIT_NONE,
  31         SCX_EXIT_DONE,
  32
  33         SCX_EXIT_UNREG = 64,    /* user-space initiated unregistration */
  34         SCX_EXIT_UNREG_BPF,     /* BPF-initiated unregistration */
  35         SCX_EXIT_UNREG_KERN,    /* kernel-initiated unregistration */
  36         SCX_EXIT_SYSRQ,         /* requested by 'S' sysrq */
  37
  38         SCX_EXIT_ERROR = 1024,  /* runtime error, error msg contains details */
  39         SCX_EXIT_ERROR_BPF,     /* ERROR but triggered through scx_bpf_error() */
  40         SCX_EXIT_ERROR_STALL,   /* watchdog detected stalled runnable tasks */
  41 };
  42
  43 /*
  44  * An exit code can be specified when exiting with scx_bpf_exit() or
  45  * scx_ops_exit(), corresponding to exit_kind UNREG_BPF and UNREG_KERN
  46  * respectively. The codes are 64bit of the format:
  47  *
  48  *   Bits: [63  ..  48 47   ..  32 31 .. 0]
  49  *         [ SYS ACT ] [ SYS RSN ] [ USR  ]
  50  *
  51  *   SYS ACT: System-defined exit actions
  52  *   SYS RSN: System-defined exit reasons
  53  *   USR    : User-defined exit codes and reasons
  54  *
  55  * Using the above, users may communicate intention and context by ORing system
  56  * actions and/or system reasons with a user-defined exit code.
  57  */
  58 enum scx_exit_code {
  59         /* Reasons */
  60         SCX_ECODE_RSN_HOTPLUG   = 1LLU << 32,
  61
  62         /* Actions */
  63         SCX_ECODE_ACT_RESTART   = 1LLU << 48,
  64 };
  65
  66 /*
  67  * scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is
  68  * being disabled.
  69  */
  70 struct scx_exit_info {
  71         /* %SCX_EXIT_* - broad category of the exit reason */
  72         enum scx_exit_kind      kind;
  73
  74         /* exit code if gracefully exiting */
  75         s64                     exit_code;
  76
  77         /* textual representation of the above */
  78         const char              *reason;
  79
  80         /* backtrace if exiting due to an error */
  81         unsigned long           *bt;
  82         u32                     bt_len;
  83
  84         /* informational message */
  85         char                    *msg;
  86
  87         /* debug dump */
  88         char                    *dump;
  89 };
  90
  91 /* sched_ext_ops.flags */
  92 enum scx_ops_flags {
  93         /*
  94          * Keep built-in idle tracking even if ops.update_idle() is implemented.
  95          */
  96         SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0,
  97
  98         /*
  99          * By default, if there are no other task to run on the CPU, ext core
 100          * keeps running the current task even after its slice expires. If this
 101          * flag is specified, such tasks are passed to ops.enqueue() with
 102          * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info.
 103          */
 104         SCX_OPS_ENQ_LAST        = 1LLU << 1,
 105
 106         /*
 107          * An exiting task may schedule after PF_EXITING is set. In such cases,
 108          * bpf_task_from_pid() may not be able to find the task and if the BPF
 109          * scheduler depends on pid lookup for dispatching, the task will be
 110          * lost leading to various issues including RCU grace period stalls.
 111          *
 112          * To mask this problem, by default, unhashed tasks are automatically
 113          * dispatched to the local DSQ on enqueue. If the BPF scheduler doesn't
 114          * depend on pid lookups and wants to handle these tasks directly, the
 115          * following flag can be used.
 116          */
 117         SCX_OPS_ENQ_EXITING     = 1LLU << 2,
 118
 119         /*
 120          * If set, only tasks with policy set to SCHED_EXT are attached to
 121          * sched_ext. If clear, SCHED_NORMAL tasks are also included.
 122          */
 123         SCX_OPS_SWITCH_PARTIAL  = 1LLU << 3,
 124
 125         /*
 126          * CPU cgroup support flags
 127          */
 128         SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* cpu.weight */
 129
 130         SCX_OPS_ALL_FLAGS       = SCX_OPS_KEEP_BUILTIN_IDLE |
 131                                   SCX_OPS_ENQ_LAST |
 132                                   SCX_OPS_ENQ_EXITING |
 133                                   SCX_OPS_SWITCH_PARTIAL |
 134                                   SCX_OPS_HAS_CGROUP_WEIGHT,
 135 };
 136
 137 /* argument container for ops.init_task() */
 138 struct scx_init_task_args {
 139         /*
 140          * Set if ops.init_task() is being invoked on the fork path, as opposed
 141          * to the scheduler transition path.
 142          */
 143         bool                    fork;
 144 #ifdef CONFIG_EXT_GROUP_SCHED
 145         /* the cgroup the task is joining */
 146         struct cgroup           *cgroup;
 147 #endif
 148 };
 149
 150 /* argument container for ops.exit_task() */
 151 struct scx_exit_task_args {
 152         /* Whether the task exited before running on sched_ext. */
 153         bool cancelled;
 154 };
 155
 156 /* argument container for ops->cgroup_init() */
 157 struct scx_cgroup_init_args {
 158         /* the weight of the cgroup [1..10000] */
 159         u32                     weight;
 160 };
 161
 162 enum scx_cpu_preempt_reason {
 163         /* next task is being scheduled by &sched_class_rt */
 164         SCX_CPU_PREEMPT_RT,
 165         /* next task is being scheduled by &sched_class_dl */
 166         SCX_CPU_PREEMPT_DL,
 167         /* next task is being scheduled by &sched_class_stop */
 168         SCX_CPU_PREEMPT_STOP,
 169         /* unknown reason for SCX being preempted */
 170         SCX_CPU_PREEMPT_UNKNOWN,
 171 };
 172
 173 /*
 174  * Argument container for ops->cpu_acquire(). Currently empty, but may be
 175  * expanded in the future.
 176  */
 177 struct scx_cpu_acquire_args {};
 178
 179 /* argument container for ops->cpu_release() */
 180 struct scx_cpu_release_args {
 181         /* the reason the CPU was preempted */
 182         enum scx_cpu_preempt_reason reason;
 183
 184         /* the task that's going to be scheduled on the CPU */
 185         struct task_struct      *task;
 186 };
 187
 188 /*
 189  * Informational context provided to dump operations.
 190  */
 191 struct scx_dump_ctx {
 192         enum scx_exit_kind      kind;
 193         s64                     exit_code;
 194         const char              *reason;
 195         u64                     at_ns;
 196         u64                     at_jiffies;
 197 };
 198
 199 /**
 200  * struct sched_ext_ops - Operation table for BPF scheduler implementation
 201  *
 202  * A BPF scheduler can implement an arbitrary scheduling policy by
 203  * implementing and loading operations in this table. Note that a userland
 204  * scheduling policy can also be implemented using the BPF scheduler
 205  * as a shim layer.
 206  */
 207 struct sched_ext_ops {
 208         /**
 209          * select_cpu - Pick the target CPU for a task which is being woken up
 210          * @p: task being woken up
 211          * @prev_cpu: the cpu @p was on before sleeping
 212          * @wake_flags: SCX_WAKE_*
 213          *
 214          * Decision made here isn't final. @p may be moved to any CPU while it
 215          * is getting dispatched for execution later. However, as @p is not on
 216          * the rq at this point, getting the eventual execution CPU right here
 217          * saves a small bit of overhead down the line.
 218          *
 219          * If an idle CPU is returned, the CPU is kicked and will try to
 220          * dispatch. While an explicit custom mechanism can be added,
 221          * select_cpu() serves as the default way to wake up idle CPUs.
 222          *
 223          * @p may be inserted into a DSQ directly by calling
 224          * scx_bpf_dsq_insert(). If so, the ops.enqueue() will be skipped.
 225          * Directly inserting into %SCX_DSQ_LOCAL will put @p in the local DSQ
 226          * of the CPU returned by this operation.
 227          *
 228          * Note that select_cpu() is never called for tasks that can only run
 229          * on a single CPU or tasks with migration disabled, as they don't have
 230          * the option to select a different CPU. See select_task_rq() for
 231          * details.
 232          */
 233         s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags);
 234
 235         /**
 236          * enqueue - Enqueue a task on the BPF scheduler
 237          * @p: task being enqueued
 238          * @enq_flags: %SCX_ENQ_*
 239          *
 240          * @p is ready to run. Insert directly into a DSQ by calling
 241          * scx_bpf_dsq_insert() or enqueue on the BPF scheduler. If not directly
 242          * inserted, the bpf scheduler owns @p and if it fails to dispatch @p,
 243          * the task will stall.
 244          *
 245          * If @p was inserted into a DSQ from ops.select_cpu(), this callback is
 246          * skipped.
 247          */
 248         void (*enqueue)(struct task_struct *p, u64 enq_flags);
 249
 250         /**
 251          * dequeue - Remove a task from the BPF scheduler
 252          * @p: task being dequeued
 253          * @deq_flags: %SCX_DEQ_*
 254          *
 255          * Remove @p from the BPF scheduler. This is usually called to isolate
 256          * the task while updating its scheduling properties (e.g. priority).
 257          *
 258          * The ext core keeps track of whether the BPF side owns a given task or
 259          * not and can gracefully ignore spurious dispatches from BPF side,
 260          * which makes it safe to not implement this method. However, depending
 261          * on the scheduling logic, this can lead to confusing behaviors - e.g.
 262          * scheduling position not being updated across a priority change.
 263          */
 264         void (*dequeue)(struct task_struct *p, u64 deq_flags);
 265
 266         /**
 267          * dispatch - Dispatch tasks from the BPF scheduler and/or user DSQs
 268          * @cpu: CPU to dispatch tasks for
 269          * @prev: previous task being switched out
 270          *
 271          * Called when a CPU's local dsq is empty. The operation should dispatch
 272          * one or more tasks from the BPF scheduler into the DSQs using
 273          * scx_bpf_dsq_insert() and/or move from user DSQs into the local DSQ
 274          * using scx_bpf_dsq_move_to_local().
 275          *
 276          * The maximum number of times scx_bpf_dsq_insert() can be called
 277          * without an intervening scx_bpf_dsq_move_to_local() is specified by
 278          * ops.dispatch_max_batch. See the comments on top of the two functions
 279          * for more details.
 280          *
 281          * When not %NULL, @prev is an SCX task with its slice depleted. If
 282          * @prev is still runnable as indicated by set %SCX_TASK_QUEUED in
 283          * @prev->scx.flags, it is not enqueued yet and will be enqueued after
 284          * ops.dispatch() returns. To keep executing @prev, return without
 285          * dispatching or moving any tasks. Also see %SCX_OPS_ENQ_LAST.
 286          */
 287         void (*dispatch)(s32 cpu, struct task_struct *prev);
 288
 289         /**
 290          * tick - Periodic tick
 291          * @p: task running currently
 292          *
 293          * This operation is called every 1/HZ seconds on CPUs which are
 294          * executing an SCX task. Setting @p->scx.slice to 0 will trigger an
 295          * immediate dispatch cycle on the CPU.
 296          */
 297         void (*tick)(struct task_struct *p);
 298
 299         /**
 300          * runnable - A task is becoming runnable on its associated CPU
 301          * @p: task becoming runnable
 302          * @enq_flags: %SCX_ENQ_*
 303          *
 304          * This and the following three functions can be used to track a task's
 305          * execution state transitions. A task becomes ->runnable() on a CPU,
 306          * and then goes through one or more ->running() and ->stopping() pairs
 307          * as it runs on the CPU, and eventually becomes ->quiescent() when it's
 308          * done running on the CPU.
 309          *
 310          * @p is becoming runnable on the CPU because it's
 311          *
 312          * - waking up (%SCX_ENQ_WAKEUP)
 313          * - being moved from another CPU
 314          * - being restored after temporarily taken off the queue for an
 315          *   attribute change.
 316          *
 317          * This and ->enqueue() are related but not coupled. This operation
 318          * notifies @p's state transition and may not be followed by ->enqueue()
 319          * e.g. when @p is being dispatched to a remote CPU, or when @p is
 320          * being enqueued on a CPU experiencing a hotplug event. Likewise, a
 321          * task may be ->enqueue()'d without being preceded by this operation
 322          * e.g. after exhausting its slice.
 323          */
 324         void (*runnable)(struct task_struct *p, u64 enq_flags);
 325
 326         /**
 327          * running - A task is starting to run on its associated CPU
 328          * @p: task starting to run
 329          *
 330          * See ->runnable() for explanation on the task state notifiers.
 331          */
 332         void (*running)(struct task_struct *p);
 333
 334         /**
 335          * stopping - A task is stopping execution
 336          * @p: task stopping to run
 337          * @runnable: is task @p still runnable?
 338          *
 339          * See ->runnable() for explanation on the task state notifiers. If
 340          * !@runnable, ->quiescent() will be invoked after this operation
 341          * returns.
 342          */
 343         void (*stopping)(struct task_struct *p, bool runnable);
 344
 345         /**
 346          * quiescent - A task is becoming not runnable on its associated CPU
 347          * @p: task becoming not runnable
 348          * @deq_flags: %SCX_DEQ_*
 349          *
 350          * See ->runnable() for explanation on the task state notifiers.
 351          *
 352          * @p is becoming quiescent on the CPU because it's
 353          *
 354          * - sleeping (%SCX_DEQ_SLEEP)
 355          * - being moved to another CPU
 356          * - being temporarily taken off the queue for an attribute change
 357          *   (%SCX_DEQ_SAVE)
 358          *
 359          * This and ->dequeue() are related but not coupled. This operation
 360          * notifies @p's state transition and may not be preceded by ->dequeue()
 361          * e.g. when @p is being dispatched to a remote CPU.
 362          */
 363         void (*quiescent)(struct task_struct *p, u64 deq_flags);
 364
 365         /**
 366          * yield - Yield CPU
 367          * @from: yielding task
 368          * @to: optional yield target task
 369          *
 370          * If @to is NULL, @from is yielding the CPU to other runnable tasks.
 371          * The BPF scheduler should ensure that other available tasks are
 372          * dispatched before the yielding task. Return value is ignored in this
 373          * case.
 374          *
 375          * If @to is not-NULL, @from wants to yield the CPU to @to. If the bpf
 376          * scheduler can implement the request, return %true; otherwise, %false.
 377          */
 378         bool (*yield)(struct task_struct *from, struct task_struct *to);
 379
 380         /**
 381          * core_sched_before - Task ordering for core-sched
 382          * @a: task A
 383          * @b: task B
 384          *
 385          * Used by core-sched to determine the ordering between two tasks. See
 386          * Documentation/admin-guide/hw-vuln/core-scheduling.rst for details on
 387          * core-sched.
 388          *
 389          * Both @a and @b are runnable and may or may not currently be queued on
 390          * the BPF scheduler. Should return %true if @a should run before @b.
 391          * %false if there's no required ordering or @b should run before @a.
 392          *
 393          * If not specified, the default is ordering them according to when they
 394          * became runnable.
 395          */
 396         bool (*core_sched_before)(struct task_struct *a, struct task_struct *b);
 397
 398         /**
 399          * set_weight - Set task weight
 400          * @p: task to set weight for
 401          * @weight: new weight [1..10000]
 402          *
 403          * Update @p's weight to @weight.
 404          */
 405         void (*set_weight)(struct task_struct *p, u32 weight);
 406
 407         /**
 408          * set_cpumask - Set CPU affinity
 409          * @p: task to set CPU affinity for
 410          * @cpumask: cpumask of cpus that @p can run on
 411          *
 412          * Update @p's CPU affinity to @cpumask.
 413          */
 414         void (*set_cpumask)(struct task_struct *p,
 415                             const struct cpumask *cpumask);
 416
 417         /**
 418          * update_idle - Update the idle state of a CPU
 419          * @cpu: CPU to udpate the idle state for
 420          * @idle: whether entering or exiting the idle state
 421          *
 422          * This operation is called when @rq's CPU goes or leaves the idle
 423          * state. By default, implementing this operation disables the built-in
 424          * idle CPU tracking and the following helpers become unavailable:
 425          *
 426          * - scx_bpf_select_cpu_dfl()
 427          * - scx_bpf_test_and_clear_cpu_idle()
 428          * - scx_bpf_pick_idle_cpu()
 429          *
 430          * The user also must implement ops.select_cpu() as the default
 431          * implementation relies on scx_bpf_select_cpu_dfl().
 432          *
 433          * Specify the %SCX_OPS_KEEP_BUILTIN_IDLE flag to keep the built-in idle
 434          * tracking.
 435          */
 436         void (*update_idle)(s32 cpu, bool idle);
 437
 438         /**
 439          * cpu_acquire - A CPU is becoming available to the BPF scheduler
 440          * @cpu: The CPU being acquired by the BPF scheduler.
 441          * @args: Acquire arguments, see the struct definition.
 442          *
 443          * A CPU that was previously released from the BPF scheduler is now once
 444          * again under its control.
 445          */
 446         void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args);
 447
 448         /**
 449          * cpu_release - A CPU is taken away from the BPF scheduler
 450          * @cpu: The CPU being released by the BPF scheduler.
 451          * @args: Release arguments, see the struct definition.
 452          *
 453          * The specified CPU is no longer under the control of the BPF
 454          * scheduler. This could be because it was preempted by a higher
 455          * priority sched_class, though there may be other reasons as well. The
 456          * caller should consult @args->reason to determine the cause.
 457          */
 458         void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args);
 459
 460         /**
 461          * init_task - Initialize a task to run in a BPF scheduler
 462          * @p: task to initialize for BPF scheduling
 463          * @args: init arguments, see the struct definition
 464          *
 465          * Either we're loading a BPF scheduler or a new task is being forked.
 466          * Initialize @p for BPF scheduling. This operation may block and can
 467          * be used for allocations, and is called exactly once for a task.
 468          *
 469          * Return 0 for success, -errno for failure. An error return while
 470          * loading will abort loading of the BPF scheduler. During a fork, it
 471          * will abort that specific fork.
 472          */
 473         s32 (*init_task)(struct task_struct *p, struct scx_init_task_args *args);
 474
 475         /**
 476          * exit_task - Exit a previously-running task from the system
 477          * @p: task to exit
 478          *
 479          * @p is exiting or the BPF scheduler is being unloaded. Perform any
 480          * necessary cleanup for @p.
 481          */
 482         void (*exit_task)(struct task_struct *p, struct scx_exit_task_args *args);
 483
 484         /**
 485          * enable - Enable BPF scheduling for a task
 486          * @p: task to enable BPF scheduling for
 487          *
 488          * Enable @p for BPF scheduling. enable() is called on @p any time it
 489          * enters SCX, and is always paired with a matching disable().
 490          */
 491         void (*enable)(struct task_struct *p);
 492
 493         /**
 494          * disable - Disable BPF scheduling for a task
 495          * @p: task to disable BPF scheduling for
 496          *
 497          * @p is exiting, leaving SCX or the BPF scheduler is being unloaded.
 498          * Disable BPF scheduling for @p. A disable() call is always matched
 499          * with a prior enable() call.
 500          */
 501         void (*disable)(struct task_struct *p);
 502
 503         /**
 504          * dump - Dump BPF scheduler state on error
 505          * @ctx: debug dump context
 506          *
 507          * Use scx_bpf_dump() to generate BPF scheduler specific debug dump.
 508          */
 509         void (*dump)(struct scx_dump_ctx *ctx);
 510
 511         /**
 512          * dump_cpu - Dump BPF scheduler state for a CPU on error
 513          * @ctx: debug dump context
 514          * @cpu: CPU to generate debug dump for
 515          * @idle: @cpu is currently idle without any runnable tasks
 516          *
 517          * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for
 518          * @cpu. If @idle is %true and this operation doesn't produce any
 519          * output, @cpu is skipped for dump.
 520          */
 521         void (*dump_cpu)(struct scx_dump_ctx *ctx, s32 cpu, bool idle);
 522
 523         /**
 524          * dump_task - Dump BPF scheduler state for a runnable task on error
 525          * @ctx: debug dump context
 526          * @p: runnable task to generate debug dump for
 527          *
 528          * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for
 529          * @p.
 530          */
 531         void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p);
 532
 533 #ifdef CONFIG_EXT_GROUP_SCHED
 534         /**
 535          * cgroup_init - Initialize a cgroup
 536          * @cgrp: cgroup being initialized
 537          * @args: init arguments, see the struct definition
 538          *
 539          * Either the BPF scheduler is being loaded or @cgrp created, initialize
 540          * @cgrp for sched_ext. This operation may block.
 541          *
 542          * Return 0 for success, -errno for failure. An error return while
 543          * loading will abort loading of the BPF scheduler. During cgroup
 544          * creation, it will abort the specific cgroup creation.
 545          */
 546         s32 (*cgroup_init)(struct cgroup *cgrp,
 547                            struct scx_cgroup_init_args *args);
 548
 549         /**
 550          * cgroup_exit - Exit a cgroup
 551          * @cgrp: cgroup being exited
 552          *
 553          * Either the BPF scheduler is being unloaded or @cgrp destroyed, exit
 554          * @cgrp for sched_ext. This operation my block.
 555          */
 556         void (*cgroup_exit)(struct cgroup *cgrp);
 557
 558         /**
 559          * cgroup_prep_move - Prepare a task to be moved to a different cgroup
 560          * @p: task being moved
 561          * @from: cgroup @p is being moved from
 562          * @to: cgroup @p is being moved to
 563          *
 564          * Prepare @p for move from cgroup @from to @to. This operation may
 565          * block and can be used for allocations.
 566          *
 567          * Return 0 for success, -errno for failure. An error return aborts the
 568          * migration.
 569          */
 570         s32 (*cgroup_prep_move)(struct task_struct *p,
 571                                 struct cgroup *from, struct cgroup *to);
 572
 573         /**
 574          * cgroup_move - Commit cgroup move
 575          * @p: task being moved
 576          * @from: cgroup @p is being moved from
 577          * @to: cgroup @p is being moved to
 578          *
 579          * Commit the move. @p is dequeued during this operation.
 580          */
 581         void (*cgroup_move)(struct task_struct *p,
 582                             struct cgroup *from, struct cgroup *to);
 583
 584         /**
 585          * cgroup_cancel_move - Cancel cgroup move
 586          * @p: task whose cgroup move is being canceled
 587          * @from: cgroup @p was being moved from
 588          * @to: cgroup @p was being moved to
 589          *
 590          * @p was cgroup_prep_move()'d but failed before reaching cgroup_move().
 591          * Undo the preparation.
 592          */
 593         void (*cgroup_cancel_move)(struct task_struct *p,
 594                                    struct cgroup *from, struct cgroup *to);
 595
 596         /**
 597          * cgroup_set_weight - A cgroup's weight is being changed
 598          * @cgrp: cgroup whose weight is being updated
 599          * @weight: new weight [1..10000]
 600          *
 601          * Update @tg's weight to @weight.
 602          */
 603         void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight);
 604 #endif  /* CONFIG_EXT_GROUP_SCHED */
 605
 606         /*
 607          * All online ops must come before ops.cpu_online().
 608          */
 609
 610         /**
 611          * cpu_online - A CPU became online
 612          * @cpu: CPU which just came up
 613          *
 614          * @cpu just came online. @cpu will not call ops.enqueue() or
 615          * ops.dispatch(), nor run tasks associated with other CPUs beforehand.
 616          */
 617         void (*cpu_online)(s32 cpu);
 618
 619         /**
 620          * cpu_offline - A CPU is going offline
 621          * @cpu: CPU which is going offline
 622          *
 623          * @cpu is going offline. @cpu will not call ops.enqueue() or
 624          * ops.dispatch(), nor run tasks associated with other CPUs afterwards.
 625          */
 626         void (*cpu_offline)(s32 cpu);
 627
 628         /*
 629          * All CPU hotplug ops must come before ops.init().
 630          */
 631
 632         /**
 633          * init - Initialize the BPF scheduler
 634          */
 635         s32 (*init)(void);
 636
 637         /**
 638          * exit - Clean up after the BPF scheduler
 639          * @info: Exit info
 640          *
 641          * ops.exit() is also called on ops.init() failure, which is a bit
 642          * unusual. This is to allow rich reporting through @info on how
 643          * ops.init() failed.
 644          */
 645         void (*exit)(struct scx_exit_info *info);
 646
 647         /**
 648          * dispatch_max_batch - Max nr of tasks that dispatch() can dispatch
 649          */
 650         u32 dispatch_max_batch;
 651
 652         /**
 653          * flags - %SCX_OPS_* flags
 654          */
 655         u64 flags;
 656
 657         /**
 658          * timeout_ms - The maximum amount of time, in milliseconds, that a
 659          * runnable task should be able to wait before being scheduled. The
 660          * maximum timeout may not exceed the default timeout of 30 seconds.
 661          *
 662          * Defaults to the maximum allowed timeout value of 30 seconds.
 663          */
 664         u32 timeout_ms;
 665
 666         /**
 667          * exit_dump_len - scx_exit_info.dump buffer length. If 0, the default
 668          * value of 32768 is used.
 669          */
 670         u32 exit_dump_len;
 671
 672         /**
 673          * hotplug_seq - A sequence number that may be set by the scheduler to
 674          * detect when a hotplug event has occurred during the loading process.
 675          * If 0, no detection occurs. Otherwise, the scheduler will fail to
 676          * load if the sequence number does not match @scx_hotplug_seq on the
 677          * enable path.
 678          */
 679         u64 hotplug_seq;
 680
 681         /**
 682          * name - BPF scheduler's name
 683          *
 684          * Must be a non-zero valid BPF object name including only isalnum(),
 685          * '_' and '.' chars. Shows up in kernel.sched_ext_ops sysctl while the
 686          * BPF scheduler is enabled.
 687          */
 688         char name[SCX_OPS_NAME_LEN];
 689 };
 690
 691 enum scx_opi {
 692         SCX_OPI_BEGIN                   = 0,
 693         SCX_OPI_NORMAL_BEGIN            = 0,
 694         SCX_OPI_NORMAL_END              = SCX_OP_IDX(cpu_online),
 695         SCX_OPI_CPU_HOTPLUG_BEGIN       = SCX_OP_IDX(cpu_online),
 696         SCX_OPI_CPU_HOTPLUG_END         = SCX_OP_IDX(init),
 697         SCX_OPI_END                     = SCX_OP_IDX(init),
 698 };
 699
 700 enum scx_wake_flags {
 701         /* expose select WF_* flags as enums */
 702         SCX_WAKE_FORK           = WF_FORK,
 703         SCX_WAKE_TTWU           = WF_TTWU,
 704         SCX_WAKE_SYNC           = WF_SYNC,
 705 };
 706
 707 enum scx_enq_flags {
 708         /* expose select ENQUEUE_* flags as enums */
 709         SCX_ENQ_WAKEUP          = ENQUEUE_WAKEUP,
 710         SCX_ENQ_HEAD            = ENQUEUE_HEAD,
 711         SCX_ENQ_CPU_SELECTED    = ENQUEUE_RQ_SELECTED,
 712
 713         /* high 32bits are SCX specific */
 714
 715         /*
 716          * Set the following to trigger preemption when calling
 717          * scx_bpf_dsq_insert() with a local dsq as the target. The slice of the
 718          * current task is cleared to zero and the CPU is kicked into the
 719          * scheduling path. Implies %SCX_ENQ_HEAD.
 720          */
 721         SCX_ENQ_PREEMPT         = 1LLU << 32,
 722
 723         /*
 724          * The task being enqueued was previously enqueued on the current CPU's
 725          * %SCX_DSQ_LOCAL, but was removed from it in a call to the
 726          * bpf_scx_reenqueue_local() kfunc. If bpf_scx_reenqueue_local() was
 727          * invoked in a ->cpu_release() callback, and the task is again
 728          * dispatched back to %SCX_LOCAL_DSQ by this current ->enqueue(), the
 729          * task will not be scheduled on the CPU until at least the next invocation
 730          * of the ->cpu_acquire() callback.
 731          */
 732         SCX_ENQ_REENQ           = 1LLU << 40,
 733
 734         /*
 735          * The task being enqueued is the only task available for the cpu. By
 736          * default, ext core keeps executing such tasks but when
 737          * %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with the
 738          * %SCX_ENQ_LAST flag set.
 739          *
 740          * The BPF scheduler is responsible for triggering a follow-up
 741          * scheduling event. Otherwise, Execution may stall.
 742          */
 743         SCX_ENQ_LAST            = 1LLU << 41,
 744
 745         /* high 8 bits are internal */
 746         __SCX_ENQ_INTERNAL_MASK = 0xffLLU << 56,
 747
 748         SCX_ENQ_CLEAR_OPSS      = 1LLU << 56,
 749         SCX_ENQ_DSQ_PRIQ        = 1LLU << 57,
 750 };
 751
 752 enum scx_deq_flags {
 753         /* expose select DEQUEUE_* flags as enums */
 754         SCX_DEQ_SLEEP           = DEQUEUE_SLEEP,
 755
 756         /* high 32bits are SCX specific */
 757
 758         /*
 759          * The generic core-sched layer decided to execute the task even though
 760          * it hasn't been dispatched yet. Dequeue from the BPF side.
 761          */
 762         SCX_DEQ_CORE_SCHED_EXEC = 1LLU << 32,
 763 };
 764
 765 enum scx_pick_idle_cpu_flags {
 766         SCX_PICK_IDLE_CORE      = 1LLU << 0,    /* pick a CPU whose SMT siblings are also idle */
 767 };
 768
 769 enum scx_kick_flags {
 770         /*
 771          * Kick the target CPU if idle. Guarantees that the target CPU goes
 772          * through at least one full scheduling cycle before going idle. If the
 773          * target CPU can be determined to be currently not idle and going to go
 774          * through a scheduling cycle before going idle, noop.
 775          */
 776         SCX_KICK_IDLE           = 1LLU << 0,
 777
 778         /*
 779          * Preempt the current task and execute the dispatch path. If the
 780          * current task of the target CPU is an SCX task, its ->scx.slice is
 781          * cleared to zero before the scheduling path is invoked so that the
 782          * task expires and the dispatch path is invoked.
 783          */
 784         SCX_KICK_PREEMPT        = 1LLU << 1,
 785
 786         /*
 787          * Wait for the CPU to be rescheduled. The scx_bpf_kick_cpu() call will
 788          * return after the target CPU finishes picking the next task.
 789          */
 790         SCX_KICK_WAIT           = 1LLU << 2,
 791 };
 792
 793 enum scx_tg_flags {
 794         SCX_TG_ONLINE           = 1U << 0,
 795         SCX_TG_INITED           = 1U << 1,
 796 };
 797
 798 enum scx_ops_enable_state {
 799         SCX_OPS_ENABLING,
 800         SCX_OPS_ENABLED,
 801         SCX_OPS_DISABLING,
 802         SCX_OPS_DISABLED,
 803 };
 804
 805 static const char *scx_ops_enable_state_str[] = {
 806         [SCX_OPS_ENABLING]      = "enabling",
 807         [SCX_OPS_ENABLED]       = "enabled",
 808         [SCX_OPS_DISABLING]     = "disabling",
 809         [SCX_OPS_DISABLED]      = "disabled",
 810 };
 811
 812 /*
 813  * sched_ext_entity->ops_state
 814  *
 815  * Used to track the task ownership between the SCX core and the BPF scheduler.
 816  * State transitions look as follows:
 817  *
 818  * NONE -> QUEUEING -> QUEUED -> DISPATCHING
 819  *   ^              |                 |
 820  *   |              v                 v
 821  *   \-------------------------------/
 822  *
 823  * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call
 824  * sites for explanations on the conditions being waited upon and why they are
 825  * safe. Transitions out of them into NONE or QUEUED must store_release and the
 826  * waiters should load_acquire.
 827  *
 828  * Tracking scx_ops_state enables sched_ext core to reliably determine whether
 829  * any given task can be dispatched by the BPF scheduler at all times and thus
 830  * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler
 831  * to try to dispatch any task anytime regardless of its state as the SCX core
 832  * can safely reject invalid dispatches.
 833  */
 834 enum scx_ops_state {
 835         SCX_OPSS_NONE,          /* owned by the SCX core */
 836         SCX_OPSS_QUEUEING,      /* in transit to the BPF scheduler */
 837         SCX_OPSS_QUEUED,        /* owned by the BPF scheduler */
 838         SCX_OPSS_DISPATCHING,   /* in transit back to the SCX core */
 839
 840         /*
 841          * QSEQ brands each QUEUED instance so that, when dispatch races
 842          * dequeue/requeue, the dispatcher can tell whether it still has a claim
 843          * on the task being dispatched.
 844          *
 845          * As some 32bit archs can't do 64bit store_release/load_acquire,
 846          * p->scx.ops_state is atomic_long_t which leaves 30 bits for QSEQ on
 847          * 32bit machines. The dispatch race window QSEQ protects is very narrow
 848          * and runs with IRQ disabled. 30 bits should be sufficient.
 849          */
 850         SCX_OPSS_QSEQ_SHIFT     = 2,
 851 };
 852
 853 /* Use macros to ensure that the type is unsigned long for the masks */
 854 #define SCX_OPSS_STATE_MASK     ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1)
 855 #define SCX_OPSS_QSEQ_MASK      (~SCX_OPSS_STATE_MASK)
 856
 857 /*
 858  * During exit, a task may schedule after losing its PIDs. When disabling the
 859  * BPF scheduler, we need to be able to iterate tasks in every state to
 860  * guarantee system safety. Maintain a dedicated task list which contains every
 861  * task between its fork and eventual free.
 862  */
 863 static DEFINE_SPINLOCK(scx_tasks_lock);
 864 static LIST_HEAD(scx_tasks);
 865
 866 /* ops enable/disable */
 867 static struct kthread_worker *scx_ops_helper;
 868 static DEFINE_MUTEX(scx_ops_enable_mutex);
 869 DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled);
 870 DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem);
 871 static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED);
 872 static unsigned long scx_in_softlockup;
 873 static atomic_t scx_ops_breather_depth = ATOMIC_INIT(0);
 874 static int scx_ops_bypass_depth;
 875 static bool scx_ops_init_task_enabled;
 876 static bool scx_switching_all;
 877 DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
 878
 879 static struct sched_ext_ops scx_ops;
 880 static bool scx_warned_zero_slice;
 881
 882 static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last);
 883 static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting);
 884 static DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt);
 885 static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled);
 886
 887 #ifdef CONFIG_SMP
 888 static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc);
 889 static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_numa);
 890 #endif
 891
 892 static struct static_key_false scx_has_op[SCX_OPI_END] =
 893         { [0 ... SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT };
 894
 895 static atomic_t scx_exit_kind = ATOMIC_INIT(SCX_EXIT_DONE);
 896 static struct scx_exit_info *scx_exit_info;
 897
 898 static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0);
 899 static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0);
 900
 901 /*
 902  * A monotically increasing sequence number that is incremented every time a
 903  * scheduler is enabled. This can be used by to check if any custom sched_ext
 904  * scheduler has ever been used in the system.
 905  */
 906 static atomic_long_t scx_enable_seq = ATOMIC_LONG_INIT(0);
 907
 908 /*
 909  * The maximum amount of time in jiffies that a task may be runnable without
 910  * being scheduled on a CPU. If this timeout is exceeded, it will trigger
 911  * scx_ops_error().
 912  */
 913 static unsigned long scx_watchdog_timeout;
 914
 915 /*
 916  * The last time the delayed work was run. This delayed work relies on
 917  * ksoftirqd being able to run to service timer interrupts, so it's possible
 918  * that this work itself could get wedged. To account for this, we check that
 919  * it's not stalled in the timer tick, and trigger an error if it is.
 920  */
 921 static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES;
 922
 923 static struct delayed_work scx_watchdog_work;
 924
 925 /* idle tracking */
 926 #ifdef CONFIG_SMP
 927 #ifdef CONFIG_CPUMASK_OFFSTACK
 928 #define CL_ALIGNED_IF_ONSTACK
 929 #else
 930 #define CL_ALIGNED_IF_ONSTACK __cacheline_aligned_in_smp
 931 #endif
 932
 933 static struct {
 934         cpumask_var_t cpu;
 935         cpumask_var_t smt;
 936 } idle_masks CL_ALIGNED_IF_ONSTACK;
 937
 938 #endif  /* CONFIG_SMP */
 939
 940 /* for %SCX_KICK_WAIT */
 941 static unsigned long __percpu *scx_kick_cpus_pnt_seqs;
 942
 943 /*
 944  * Direct dispatch marker.
 945  *
 946  * Non-NULL values are used for direct dispatch from enqueue path. A valid
 947  * pointer points to the task currently being enqueued. An ERR_PTR value is used
 948  * to indicate that direct dispatch has already happened.
 949  */
 950 static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task);
 951
 952 /*
 953  * Dispatch queues.
 954  *
 955  * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability. This is
 956  * to avoid live-locking in bypass mode where all tasks are dispatched to
 957  * %SCX_DSQ_GLOBAL and all CPUs consume from it. If per-node split isn't
 958  * sufficient, it can be further split.
 959  */
 960 static struct scx_dispatch_q **global_dsqs;
 961
 962 static const struct rhashtable_params dsq_hash_params = {
 963         .key_len                = 8,
 964         .key_offset             = offsetof(struct scx_dispatch_q, id),
 965         .head_offset            = offsetof(struct scx_dispatch_q, hash_node),
 966 };
 967
 968 static struct rhashtable dsq_hash;
 969 static LLIST_HEAD(dsqs_to_free);
 970
 971 /* dispatch buf */
 972 struct scx_dsp_buf_ent {
 973         struct task_struct      *task;
 974         unsigned long           qseq;
 975         u64                     dsq_id;
 976         u64                     enq_flags;
 977 };
 978
 979 static u32 scx_dsp_max_batch;
 980
 981 struct scx_dsp_ctx {
 982         struct rq               *rq;
 983         u32                     cursor;
 984         u32                     nr_tasks;
 985         struct scx_dsp_buf_ent  buf[];
 986 };
 987
 988 static struct scx_dsp_ctx __percpu *scx_dsp_ctx;
 989
 990 /* string formatting from BPF */
 991 struct scx_bstr_buf {
 992         u64                     data[MAX_BPRINTF_VARARGS];
 993         char                    line[SCX_EXIT_MSG_LEN];
 994 };
 995
 996 static DEFINE_RAW_SPINLOCK(scx_exit_bstr_buf_lock);
 997 static struct scx_bstr_buf scx_exit_bstr_buf;
 998
 999 /* ops debug dump */
1000 struct scx_dump_data {
1001         s32                     cpu;
1002         bool                    first;
1003         s32                     cursor;
1004         struct seq_buf          *s;
1005         const char              *prefix;
1006         struct scx_bstr_buf     buf;
1007 };
1008
1009 static struct scx_dump_data scx_dump_data = {
1010         .cpu                    = -1,
1011 };
1012
1013 /* /sys/kernel/sched_ext interface */
1014 static struct kset *scx_kset;
1015 static struct kobject *scx_root_kobj;
1016
1017 #define CREATE_TRACE_POINTS
1018 #include <trace/events/sched_ext.h>
1019
1020 static void process_ddsp_deferred_locals(struct rq *rq);
1021 static void scx_bpf_kick_cpu(s32 cpu, u64 flags);
1022 static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind,
1023                                              s64 exit_code,
1024                                              const char *fmt, ...);
1025
1026 #define scx_ops_error_kind(err, fmt, args...)                                   \
1027         scx_ops_exit_kind((err), 0, fmt, ##args)
1028
1029 #define scx_ops_exit(code, fmt, args...)                                        \
1030         scx_ops_exit_kind(SCX_EXIT_UNREG_KERN, (code), fmt, ##args)
1031
1032 #define scx_ops_error(fmt, args...)                                             \
1033         scx_ops_error_kind(SCX_EXIT_ERROR, fmt, ##args)
1034
1035 #define SCX_HAS_OP(op)  static_branch_likely(&scx_has_op[SCX_OP_IDX(op)])
1036
1037 static long jiffies_delta_msecs(unsigned long at, unsigned long now)
1038 {
1039         if (time_after(at, now))
1040                 return jiffies_to_msecs(at - now);
1041         else
1042                 return -(long)jiffies_to_msecs(now - at);
1043 }
1044
1045 /* if the highest set bit is N, return a mask with bits [N+1, 31] set */
1046 static u32 higher_bits(u32 flags)
1047 {
1048         return ~((1 << fls(flags)) - 1);
1049 }
1050
1051 /* return the mask with only the highest bit set */
1052 static u32 highest_bit(u32 flags)
1053 {
1054         int bit = fls(flags);
1055         return ((u64)1 << bit) >> 1;
1056 }
1057
1058 static bool u32_before(u32 a, u32 b)
1059 {
1060         return (s32)(a - b) < 0;
1061 }
1062
1063 static struct scx_dispatch_q *find_global_dsq(struct task_struct *p)
1064 {
1065         return global_dsqs[cpu_to_node(task_cpu(p))];
1066 }
1067
1068 static struct scx_dispatch_q *find_user_dsq(u64 dsq_id)
1069 {
1070         return rhashtable_lookup_fast(&dsq_hash, &dsq_id, dsq_hash_params);
1071 }
1072
1073 /*
1074  * scx_kf_mask enforcement. Some kfuncs can only be called from specific SCX
1075  * ops. When invoking SCX ops, SCX_CALL_OP[_RET]() should be used to indicate
1076  * the allowed kfuncs and those kfuncs should use scx_kf_allowed() to check
1077  * whether it's running from an allowed context.
1078  *
1079  * @mask is constant, always inline to cull the mask calculations.
1080  */
1081 static __always_inline void scx_kf_allow(u32 mask)
1082 {
1083         /* nesting is allowed only in increasing scx_kf_mask order */
1084         WARN_ONCE((mask | higher_bits(mask)) & current->scx.kf_mask,
1085                   "invalid nesting current->scx.kf_mask=0x%x mask=0x%x\n",
1086                   current->scx.kf_mask, mask);
1087         current->scx.kf_mask |= mask;
1088         barrier();
1089 }
1090
1091 static void scx_kf_disallow(u32 mask)
1092 {
1093         barrier();
1094         current->scx.kf_mask &= ~mask;
1095 }
1096
1097 #define SCX_CALL_OP(mask, op, args...)                                          \
1098 do {                                                                            \
1099         if (mask) {                                                             \
1100                 scx_kf_allow(mask);                                             \
1101                 scx_ops.op(args);                                               \
1102                 scx_kf_disallow(mask);                                          \
1103         } else {                                                                \
1104                 scx_ops.op(args);                                               \
1105         }                                                                       \
1106 } while (0)
1107
1108 #define SCX_CALL_OP_RET(mask, op, args...)                                      \
1109 ({                                                                              \
1110         __typeof__(scx_ops.op(args)) __ret;                                     \
1111         if (mask) {                                                             \
1112                 scx_kf_allow(mask);                                             \
1113                 __ret = scx_ops.op(args);                                       \
1114                 scx_kf_disallow(mask);                                          \
1115         } else {                                                                \
1116                 __ret = scx_ops.op(args);                                       \
1117         }                                                                       \
1118         __ret;                                                                  \
1119 })
1120
1121 /*
1122  * Some kfuncs are allowed only on the tasks that are subjects of the
1123  * in-progress scx_ops operation for, e.g., locking guarantees. To enforce such
1124  * restrictions, the following SCX_CALL_OP_*() variants should be used when
1125  * invoking scx_ops operations that take task arguments. These can only be used
1126  * for non-nesting operations due to the way the tasks are tracked.
1127  *
1128  * kfuncs which can only operate on such tasks can in turn use
1129  * scx_kf_allowed_on_arg_tasks() to test whether the invocation is allowed on
1130  * the specific task.
1131  */
1132 #define SCX_CALL_OP_TASK(mask, op, task, args...)                               \
1133 do {                                                                            \
1134         BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL);                              \
1135         current->scx.kf_tasks[0] = task;                                        \
1136         SCX_CALL_OP(mask, op, task, ##args);                                    \
1137         current->scx.kf_tasks[0] = NULL;                                        \
1138 } while (0)
1139
1140 #define SCX_CALL_OP_TASK_RET(mask, op, task, args...)                           \
1141 ({                                                                              \
1142         __typeof__(scx_ops.op(task, ##args)) __ret;                             \
1143         BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL);                              \
1144         current->scx.kf_tasks[0] = task;                                        \
1145         __ret = SCX_CALL_OP_RET(mask, op, task, ##args);                        \
1146         current->scx.kf_tasks[0] = NULL;                                        \
1147         __ret;                                                                  \
1148 })
1149
1150 #define SCX_CALL_OP_2TASKS_RET(mask, op, task0, task1, args...)                 \
1151 ({                                                                              \
1152         __typeof__(scx_ops.op(task0, task1, ##args)) __ret;                     \
1153         BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL);                              \
1154         current->scx.kf_tasks[0] = task0;                                       \
1155         current->scx.kf_tasks[1] = task1;                                       \
1156         __ret = SCX_CALL_OP_RET(mask, op, task0, task1, ##args);                \
1157         current->scx.kf_tasks[0] = NULL;                                        \
1158         current->scx.kf_tasks[1] = NULL;                                        \
1159         __ret;                                                                  \
1160 })
1161
1162 /* @mask is constant, always inline to cull unnecessary branches */
1163 static __always_inline bool scx_kf_allowed(u32 mask)
1164 {
1165         if (unlikely(!(current->scx.kf_mask & mask))) {
1166                 scx_ops_error("kfunc with mask 0x%x called from an operation only allowing 0x%x",
1167                               mask, current->scx.kf_mask);
1168                 return false;
1169         }
1170
1171         /*
1172          * Enforce nesting boundaries. e.g. A kfunc which can be called from
1173          * DISPATCH must not be called if we're running DEQUEUE which is nested
1174          * inside ops.dispatch(). We don't need to check boundaries for any
1175          * blocking kfuncs as the verifier ensures they're only called from
1176          * sleepable progs.
1177          */
1178         if (unlikely(highest_bit(mask) == SCX_KF_CPU_RELEASE &&
1179                      (current->scx.kf_mask & higher_bits(SCX_KF_CPU_RELEASE)))) {
1180                 scx_ops_error("cpu_release kfunc called from a nested operation");
1181                 return false;
1182         }
1183
1184         if (unlikely(highest_bit(mask) == SCX_KF_DISPATCH &&
1185                      (current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) {
1186                 scx_ops_error("dispatch kfunc called from a nested operation");
1187                 return false;
1188         }
1189
1190         return true;
1191 }
1192
1193 /* see SCX_CALL_OP_TASK() */
1194 static __always_inline bool scx_kf_allowed_on_arg_tasks(u32 mask,
1195                                                         struct task_struct *p)
1196 {
1197         if (!scx_kf_allowed(mask))
1198                 return false;
1199
1200         if (unlikely((p != current->scx.kf_tasks[0] &&
1201                       p != current->scx.kf_tasks[1]))) {
1202                 scx_ops_error("called on a task not being operated on");
1203                 return false;
1204         }
1205
1206         return true;
1207 }
1208
1209 static bool scx_kf_allowed_if_unlocked(void)
1210 {
1211         return !current->scx.kf_mask;
1212 }
1213
1214 /**
1215  * nldsq_next_task - Iterate to the next task in a non-local DSQ
1216  * @dsq: user dsq being interated
1217  * @cur: current position, %NULL to start iteration
1218  * @rev: walk backwards
1219  *
1220  * Returns %NULL when iteration is finished.
1221  */
1222 static struct task_struct *nldsq_next_task(struct scx_dispatch_q *dsq,
1223                                            struct task_struct *cur, bool rev)
1224 {
1225         struct list_head *list_node;
1226         struct scx_dsq_list_node *dsq_lnode;
1227
1228         lockdep_assert_held(&dsq->lock);
1229
1230         if (cur)
1231                 list_node = &cur->scx.dsq_list.node;
1232         else
1233                 list_node = &dsq->list;
1234
1235         /* find the next task, need to skip BPF iteration cursors */
1236         do {
1237                 if (rev)
1238                         list_node = list_node->prev;
1239                 else
1240                         list_node = list_node->next;
1241
1242                 if (list_node == &dsq->list)
1243                         return NULL;
1244
1245                 dsq_lnode = container_of(list_node, struct scx_dsq_list_node,
1246                                          node);
1247         } while (dsq_lnode->flags & SCX_DSQ_LNODE_ITER_CURSOR);
1248
1249         return container_of(dsq_lnode, struct task_struct, scx.dsq_list);
1250 }
1251
1252 #define nldsq_for_each_task(p, dsq)                                             \
1253         for ((p) = nldsq_next_task((dsq), NULL, false); (p);                    \
1254              (p) = nldsq_next_task((dsq), (p), false))
1255
1256
1257 /*
1258  * BPF DSQ iterator. Tasks in a non-local DSQ can be iterated in [reverse]
1259  * dispatch order. BPF-visible iterator is opaque and larger to allow future
1260  * changes without breaking backward compatibility. Can be used with
1261  * bpf_for_each(). See bpf_iter_scx_dsq_*().
1262  */
1263 enum scx_dsq_iter_flags {
1264         /* iterate in the reverse dispatch order */
1265         SCX_DSQ_ITER_REV                = 1U << 16,
1266
1267         __SCX_DSQ_ITER_HAS_SLICE        = 1U << 30,
1268         __SCX_DSQ_ITER_HAS_VTIME        = 1U << 31,
1269
1270         __SCX_DSQ_ITER_USER_FLAGS       = SCX_DSQ_ITER_REV,
1271         __SCX_DSQ_ITER_ALL_FLAGS        = __SCX_DSQ_ITER_USER_FLAGS |
1272                                           __SCX_DSQ_ITER_HAS_SLICE |
1273                                           __SCX_DSQ_ITER_HAS_VTIME,
1274 };
1275
1276 struct bpf_iter_scx_dsq_kern {
1277         struct scx_dsq_list_node        cursor;
1278         struct scx_dispatch_q           *dsq;
1279         u64                             slice;
1280         u64                             vtime;
1281 } __attribute__((aligned(8)));
1282
1283 struct bpf_iter_scx_dsq {
1284         u64                             __opaque[6];
1285 } __attribute__((aligned(8)));
1286
1287
1288 /*
1289  * SCX task iterator.
1290  */
1291 struct scx_task_iter {
1292         struct sched_ext_entity         cursor;
1293         struct task_struct              *locked;
1294         struct rq                       *rq;
1295         struct rq_flags                 rf;
1296         u32                             cnt;
1297 };
1298
1299 /**
1300  * scx_task_iter_start - Lock scx_tasks_lock and start a task iteration
1301  * @iter: iterator to init
1302  *
1303  * Initialize @iter and return with scx_tasks_lock held. Once initialized, @iter
1304  * must eventually be stopped with scx_task_iter_stop().
1305  *
1306  * scx_tasks_lock and the rq lock may be released using scx_task_iter_unlock()
1307  * between this and the first next() call or between any two next() calls. If
1308  * the locks are released between two next() calls, the caller is responsible
1309  * for ensuring that the task being iterated remains accessible either through
1310  * RCU read lock or obtaining a reference count.
1311  *
1312  * All tasks which existed when the iteration started are guaranteed to be
1313  * visited as long as they still exist.
1314  */
1315 static void scx_task_iter_start(struct scx_task_iter *iter)
1316 {
1317         BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS &
1318                      ((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1));
1319
1320         spin_lock_irq(&scx_tasks_lock);
1321
1322         iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR };
1323         list_add(&iter->cursor.tasks_node, &scx_tasks);
1324         iter->locked = NULL;
1325         iter->cnt = 0;
1326 }
1327
1328 static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter)
1329 {
1330         if (iter->locked) {
1331                 task_rq_unlock(iter->rq, iter->locked, &iter->rf);
1332                 iter->locked = NULL;
1333         }
1334 }
1335
1336 /**
1337  * scx_task_iter_unlock - Unlock rq and scx_tasks_lock held by a task iterator
1338  * @iter: iterator to unlock
1339  *
1340  * If @iter is in the middle of a locked iteration, it may be locking the rq of
1341  * the task currently being visited in addition to scx_tasks_lock. Unlock both.
1342  * This function can be safely called anytime during an iteration.
1343  */
1344 static void scx_task_iter_unlock(struct scx_task_iter *iter)
1345 {
1346         __scx_task_iter_rq_unlock(iter);
1347         spin_unlock_irq(&scx_tasks_lock);
1348 }
1349
1350 /**
1351  * scx_task_iter_relock - Lock scx_tasks_lock released by scx_task_iter_unlock()
1352  * @iter: iterator to re-lock
1353  *
1354  * Re-lock scx_tasks_lock unlocked by scx_task_iter_unlock(). Note that it
1355  * doesn't re-lock the rq lock. Must be called before other iterator operations.
1356  */
1357 static void scx_task_iter_relock(struct scx_task_iter *iter)
1358 {
1359         spin_lock_irq(&scx_tasks_lock);
1360 }
1361
1362 /**
1363  * scx_task_iter_stop - Stop a task iteration and unlock scx_tasks_lock
1364  * @iter: iterator to exit
1365  *
1366  * Exit a previously initialized @iter. Must be called with scx_tasks_lock held
1367  * which is released on return. If the iterator holds a task's rq lock, that rq
1368  * lock is also released. See scx_task_iter_start() for details.
1369  */
1370 static void scx_task_iter_stop(struct scx_task_iter *iter)
1371 {
1372         list_del_init(&iter->cursor.tasks_node);
1373         scx_task_iter_unlock(iter);
1374 }
1375
1376 /**
1377  * scx_task_iter_next - Next task
1378  * @iter: iterator to walk
1379  *
1380  * Visit the next task. See scx_task_iter_start() for details. Locks are dropped
1381  * and re-acquired every %SCX_OPS_TASK_ITER_BATCH iterations to avoid causing
1382  * stalls by holding scx_tasks_lock for too long.
1383  */
1384 static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter)
1385 {
1386         struct list_head *cursor = &iter->cursor.tasks_node;
1387         struct sched_ext_entity *pos;
1388
1389         if (!(++iter->cnt % SCX_OPS_TASK_ITER_BATCH)) {
1390                 scx_task_iter_unlock(iter);
1391                 cond_resched();
1392                 scx_task_iter_relock(iter);
1393         }
1394
1395         list_for_each_entry(pos, cursor, tasks_node) {
1396                 if (&pos->tasks_node == &scx_tasks)
1397                         return NULL;
1398                 if (!(pos->flags & SCX_TASK_CURSOR)) {
1399                         list_move(cursor, &pos->tasks_node);
1400                         return container_of(pos, struct task_struct, scx);
1401                 }
1402         }
1403
1404         /* can't happen, should always terminate at scx_tasks above */
1405         BUG();
1406 }
1407
1408 /**
1409  * scx_task_iter_next_locked - Next non-idle task with its rq locked
1410  * @iter: iterator to walk
1411  * @include_dead: Whether we should include dead tasks in the iteration
1412  *
1413  * Visit the non-idle task with its rq lock held. Allows callers to specify
1414  * whether they would like to filter out dead tasks. See scx_task_iter_start()
1415  * for details.
1416  */
1417 static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
1418 {
1419         struct task_struct *p;
1420
1421         __scx_task_iter_rq_unlock(iter);
1422
1423         while ((p = scx_task_iter_next(iter))) {
1424                 /*
1425                  * scx_task_iter is used to prepare and move tasks into SCX
1426                  * while loading the BPF scheduler and vice-versa while
1427                  * unloading. The init_tasks ("swappers") should be excluded
1428                  * from the iteration because:
1429                  *
1430                  * - It's unsafe to use __setschduler_prio() on an init_task to
1431                  *   determine the sched_class to use as it won't preserve its
1432                  *   idle_sched_class.
1433                  *
1434                  * - ops.init/exit_task() can easily be confused if called with
1435                  *   init_tasks as they, e.g., share PID 0.
1436                  *
1437                  * As init_tasks are never scheduled through SCX, they can be
1438                  * skipped safely. Note that is_idle_task() which tests %PF_IDLE
1439                  * doesn't work here:
1440                  *
1441                  * - %PF_IDLE may not be set for an init_task whose CPU hasn't
1442                  *   yet been onlined.
1443                  *
1444                  * - %PF_IDLE can be set on tasks that are not init_tasks. See
1445                  *   play_idle_precise() used by CONFIG_IDLE_INJECT.
1446                  *
1447                  * Test for idle_sched_class as only init_tasks are on it.
1448                  */
1449                 if (p->sched_class != &idle_sched_class)
1450                         break;
1451         }
1452         if (!p)
1453                 return NULL;
1454
1455         iter->rq = task_rq_lock(p, &iter->rf);
1456         iter->locked = p;
1457
1458         return p;
1459 }
1460
1461 static enum scx_ops_enable_state scx_ops_enable_state(void)
1462 {
1463         return atomic_read(&scx_ops_enable_state_var);
1464 }
1465
1466 static enum scx_ops_enable_state
1467 scx_ops_set_enable_state(enum scx_ops_enable_state to)
1468 {
1469         return atomic_xchg(&scx_ops_enable_state_var, to);
1470 }
1471
1472 static bool scx_ops_tryset_enable_state(enum scx_ops_enable_state to,
1473                                         enum scx_ops_enable_state from)
1474 {
1475         int from_v = from;
1476
1477         return atomic_try_cmpxchg(&scx_ops_enable_state_var, &from_v, to);
1478 }
1479
1480 static bool scx_rq_bypassing(struct rq *rq)
1481 {
1482         return unlikely(rq->scx.flags & SCX_RQ_BYPASSING);
1483 }
1484
1485 /**
1486  * wait_ops_state - Busy-wait the specified ops state to end
1487  * @p: target task
1488  * @opss: state to wait the end of
1489  *
1490  * Busy-wait for @p to transition out of @opss. This can only be used when the
1491  * state part of @opss is %SCX_QUEUEING or %SCX_DISPATCHING. This function also
1492  * has load_acquire semantics to ensure that the caller can see the updates made
1493  * in the enqueueing and dispatching paths.
1494  */
1495 static void wait_ops_state(struct task_struct *p, unsigned long opss)
1496 {
1497         do {
1498                 cpu_relax();
1499         } while (atomic_long_read_acquire(&p->scx.ops_state) == opss);
1500 }
1501
1502 /**
1503  * ops_cpu_valid - Verify a cpu number
1504  * @cpu: cpu number which came from a BPF ops
1505  * @where: extra information reported on error
1506  *
1507  * @cpu is a cpu number which came from the BPF scheduler and can be any value.
1508  * Verify that it is in range and one of the possible cpus. If invalid, trigger
1509  * an ops error.
1510  */
1511 static bool ops_cpu_valid(s32 cpu, const char *where)
1512 {
1513         if (likely(cpu >= 0 && cpu < nr_cpu_ids && cpu_possible(cpu))) {
1514                 return true;
1515         } else {
1516                 scx_ops_error("invalid CPU %d%s%s", cpu,
1517                               where ? " " : "", where ?: "");
1518                 return false;
1519         }
1520 }
1521
1522 /**
1523  * ops_sanitize_err - Sanitize a -errno value
1524  * @ops_name: operation to blame on failure
1525  * @err: -errno value to sanitize
1526  *
1527  * Verify @err is a valid -errno. If not, trigger scx_ops_error() and return
1528  * -%EPROTO. This is necessary because returning a rogue -errno up the chain can
1529  * cause misbehaviors. For an example, a large negative return from
1530  * ops.init_task() triggers an oops when passed up the call chain because the
1531  * value fails IS_ERR() test after being encoded with ERR_PTR() and then is
1532  * handled as a pointer.
1533  */
1534 static int ops_sanitize_err(const char *ops_name, s32 err)
1535 {
1536         if (err < 0 && err >= -MAX_ERRNO)
1537                 return err;
1538
1539         scx_ops_error("ops.%s() returned an invalid errno %d", ops_name, err);
1540         return -EPROTO;
1541 }
1542
1543 static void run_deferred(struct rq *rq)
1544 {
1545         process_ddsp_deferred_locals(rq);
1546 }
1547
1548 #ifdef CONFIG_SMP
1549 static void deferred_bal_cb_workfn(struct rq *rq)
1550 {
1551         run_deferred(rq);
1552 }
1553 #endif
1554
1555 static void deferred_irq_workfn(struct irq_work *irq_work)
1556 {
1557         struct rq *rq = container_of(irq_work, struct rq, scx.deferred_irq_work);
1558
1559         raw_spin_rq_lock(rq);
1560         run_deferred(rq);
1561         raw_spin_rq_unlock(rq);
1562 }
1563
1564 /**
1565  * schedule_deferred - Schedule execution of deferred actions on an rq
1566  * @rq: target rq
1567  *
1568  * Schedule execution of deferred actions on @rq. Must be called with @rq
1569  * locked. Deferred actions are executed with @rq locked but unpinned, and thus
1570  * can unlock @rq to e.g. migrate tasks to other rqs.
1571  */
1572 static void schedule_deferred(struct rq *rq)
1573 {
1574         lockdep_assert_rq_held(rq);
1575
1576 #ifdef CONFIG_SMP
1577         /*
1578          * If in the middle of waking up a task, task_woken_scx() will be called
1579          * afterwards which will then run the deferred actions, no need to
1580          * schedule anything.
1581          */
1582         if (rq->scx.flags & SCX_RQ_IN_WAKEUP)
1583                 return;
1584
1585         /*
1586          * If in balance, the balance callbacks will be called before rq lock is
1587          * released. Schedule one.
1588          */
1589         if (rq->scx.flags & SCX_RQ_IN_BALANCE) {
1590                 queue_balance_callback(rq, &rq->scx.deferred_bal_cb,
1591                                        deferred_bal_cb_workfn);
1592                 return;
1593         }
1594 #endif
1595         /*
1596          * No scheduler hooks available. Queue an irq work. They are executed on
1597          * IRQ re-enable which may take a bit longer than the scheduler hooks.
1598          * The above WAKEUP and BALANCE paths should cover most of the cases and
1599          * the time to IRQ re-enable shouldn't be long.
1600          */
1601         irq_work_queue(&rq->scx.deferred_irq_work);
1602 }
1603
1604 /**
1605  * touch_core_sched - Update timestamp used for core-sched task ordering
1606  * @rq: rq to read clock from, must be locked
1607  * @p: task to update the timestamp for
1608  *
1609  * Update @p->scx.core_sched_at timestamp. This is used by scx_prio_less() to
1610  * implement global or local-DSQ FIFO ordering for core-sched. Should be called
1611  * when a task becomes runnable and its turn on the CPU ends (e.g. slice
1612  * exhaustion).
1613  */
1614 static void touch_core_sched(struct rq *rq, struct task_struct *p)
1615 {
1616         lockdep_assert_rq_held(rq);
1617
1618 #ifdef CONFIG_SCHED_CORE
1619         /*
1620          * It's okay to update the timestamp spuriously. Use
1621          * sched_core_disabled() which is cheaper than enabled().
1622          *
1623          * As this is used to determine ordering between tasks of sibling CPUs,
1624          * it may be better to use per-core dispatch sequence instead.
1625          */
1626         if (!sched_core_disabled())
1627                 p->scx.core_sched_at = sched_clock_cpu(cpu_of(rq));
1628 #endif
1629 }
1630
1631 /**
1632  * touch_core_sched_dispatch - Update core-sched timestamp on dispatch
1633  * @rq: rq to read clock from, must be locked
1634  * @p: task being dispatched
1635  *
1636  * If the BPF scheduler implements custom core-sched ordering via
1637  * ops.core_sched_before(), @p->scx.core_sched_at is used to implement FIFO
1638  * ordering within each local DSQ. This function is called from dispatch paths
1639  * and updates @p->scx.core_sched_at if custom core-sched ordering is in effect.
1640  */
1641 static void touch_core_sched_dispatch(struct rq *rq, struct task_struct *p)
1642 {
1643         lockdep_assert_rq_held(rq);
1644
1645 #ifdef CONFIG_SCHED_CORE
1646         if (SCX_HAS_OP(core_sched_before))
1647                 touch_core_sched(rq, p);
1648 #endif
1649 }
1650
1651 static void update_curr_scx(struct rq *rq)
1652 {
1653         struct task_struct *curr = rq->curr;
1654         s64 delta_exec;
1655
1656         delta_exec = update_curr_common(rq);
1657         if (unlikely(delta_exec <= 0))
1658                 return;
1659
1660         if (curr->scx.slice != SCX_SLICE_INF) {
1661                 curr->scx.slice -= min_t(u64, curr->scx.slice, delta_exec);
1662                 if (!curr->scx.slice)
1663                         touch_core_sched(rq, curr);
1664         }
1665 }
1666
1667 static bool scx_dsq_priq_less(struct rb_node *node_a,
1668                               const struct rb_node *node_b)
1669 {
1670         const struct task_struct *a =
1671                 container_of(node_a, struct task_struct, scx.dsq_priq);
1672         const struct task_struct *b =
1673                 container_of(node_b, struct task_struct, scx.dsq_priq);
1674
1675         return time_before64(a->scx.dsq_vtime, b->scx.dsq_vtime);
1676 }
1677
1678 static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta)
1679 {
1680         /* scx_bpf_dsq_nr_queued() reads ->nr without locking, use WRITE_ONCE() */
1681         WRITE_ONCE(dsq->nr, dsq->nr + delta);
1682 }
1683
1684 static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p,
1685                              u64 enq_flags)
1686 {
1687         bool is_local = dsq->id == SCX_DSQ_LOCAL;
1688
1689         WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_list.node));
1690         WARN_ON_ONCE((p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) ||
1691                      !RB_EMPTY_NODE(&p->scx.dsq_priq));
1692
1693         if (!is_local) {
1694                 raw_spin_lock(&dsq->lock);
1695                 if (unlikely(dsq->id == SCX_DSQ_INVALID)) {
1696                         scx_ops_error("attempting to dispatch to a destroyed dsq");
1697                         /* fall back to the global dsq */
1698                         raw_spin_unlock(&dsq->lock);
1699                         dsq = find_global_dsq(p);
1700                         raw_spin_lock(&dsq->lock);
1701                 }
1702         }
1703
1704         if (unlikely((dsq->id & SCX_DSQ_FLAG_BUILTIN) &&
1705                      (enq_flags & SCX_ENQ_DSQ_PRIQ))) {
1706                 /*
1707                  * SCX_DSQ_LOCAL and SCX_DSQ_GLOBAL DSQs always consume from
1708                  * their FIFO queues. To avoid confusion and accidentally
1709                  * starving vtime-dispatched tasks by FIFO-dispatched tasks, we
1710                  * disallow any internal DSQ from doing vtime ordering of
1711                  * tasks.
1712                  */
1713                 scx_ops_error("cannot use vtime ordering for built-in DSQs");
1714                 enq_flags &= ~SCX_ENQ_DSQ_PRIQ;
1715         }
1716
1717         if (enq_flags & SCX_ENQ_DSQ_PRIQ) {
1718                 struct rb_node *rbp;
1719
1720                 /*
1721                  * A PRIQ DSQ shouldn't be using FIFO enqueueing. As tasks are
1722                  * linked to both the rbtree and list on PRIQs, this can only be
1723                  * tested easily when adding the first task.
1724                  */
1725                 if (unlikely(RB_EMPTY_ROOT(&dsq->priq) &&
1726                              nldsq_next_task(dsq, NULL, false)))
1727                         scx_ops_error("DSQ ID 0x%016llx already had FIFO-enqueued tasks",
1728                                       dsq->id);
1729
1730                 p->scx.dsq_flags |= SCX_TASK_DSQ_ON_PRIQ;
1731                 rb_add(&p->scx.dsq_priq, &dsq->priq, scx_dsq_priq_less);
1732
1733                 /*
1734                  * Find the previous task and insert after it on the list so
1735                  * that @dsq->list is vtime ordered.
1736                  */
1737                 rbp = rb_prev(&p->scx.dsq_priq);
1738                 if (rbp) {
1739                         struct task_struct *prev =
1740                                 container_of(rbp, struct task_struct,
1741                                              scx.dsq_priq);
1742                         list_add(&p->scx.dsq_list.node, &prev->scx.dsq_list.node);
1743                 } else {
1744                         list_add(&p->scx.dsq_list.node, &dsq->list);
1745                 }
1746         } else {
1747                 /* a FIFO DSQ shouldn't be using PRIQ enqueuing */
1748                 if (unlikely(!RB_EMPTY_ROOT(&dsq->priq)))
1749                         scx_ops_error("DSQ ID 0x%016llx already had PRIQ-enqueued tasks",
1750                                       dsq->id);
1751
1752                 if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT))
1753                         list_add(&p->scx.dsq_list.node, &dsq->list);
1754                 else
1755                         list_add_tail(&p->scx.dsq_list.node, &dsq->list);
1756         }
1757
1758         /* seq records the order tasks are queued, used by BPF DSQ iterator */
1759         dsq->seq++;
1760         p->scx.dsq_seq = dsq->seq;
1761
1762         dsq_mod_nr(dsq, 1);
1763         p->scx.dsq = dsq;
1764
1765         /*
1766          * scx.ddsp_dsq_id and scx.ddsp_enq_flags are only relevant on the
1767          * direct dispatch path, but we clear them here because the direct
1768          * dispatch verdict may be overridden on the enqueue path during e.g.
1769          * bypass.
1770          */
1771         p->scx.ddsp_dsq_id = SCX_DSQ_INVALID;
1772         p->scx.ddsp_enq_flags = 0;
1773
1774         /*
1775          * We're transitioning out of QUEUEING or DISPATCHING. store_release to
1776          * match waiters' load_acquire.
1777          */
1778         if (enq_flags & SCX_ENQ_CLEAR_OPSS)
1779                 atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
1780
1781         if (is_local) {
1782                 struct rq *rq = container_of(dsq, struct rq, scx.local_dsq);
1783                 bool preempt = false;
1784
1785                 if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr &&
1786                     rq->curr->sched_class == &ext_sched_class) {
1787                         rq->curr->scx.slice = 0;
1788                         preempt = true;
1789                 }
1790
1791                 if (preempt || sched_class_above(&ext_sched_class,
1792                                                  rq->curr->sched_class))
1793                         resched_curr(rq);
1794         } else {
1795                 raw_spin_unlock(&dsq->lock);
1796         }
1797 }
1798
1799 static void task_unlink_from_dsq(struct task_struct *p,
1800                                  struct scx_dispatch_q *dsq)
1801 {
1802         WARN_ON_ONCE(list_empty(&p->scx.dsq_list.node));
1803
1804         if (p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) {
1805                 rb_erase(&p->scx.dsq_priq, &dsq->priq);
1806                 RB_CLEAR_NODE(&p->scx.dsq_priq);
1807                 p->scx.dsq_flags &= ~SCX_TASK_DSQ_ON_PRIQ;
1808         }
1809
1810         list_del_init(&p->scx.dsq_list.node);
1811         dsq_mod_nr(dsq, -1);
1812 }
1813
1814 static void dispatch_dequeue(struct rq *rq, struct task_struct *p)
1815 {
1816         struct scx_dispatch_q *dsq = p->scx.dsq;
1817         bool is_local = dsq == &rq->scx.local_dsq;
1818
1819         if (!dsq) {
1820                 /*
1821                  * If !dsq && on-list, @p is on @rq's ddsp_deferred_locals.
1822                  * Unlinking is all that's needed to cancel.
1823                  */
1824                 if (unlikely(!list_empty(&p->scx.dsq_list.node)))
1825                         list_del_init(&p->scx.dsq_list.node);
1826
1827                 /*
1828                  * When dispatching directly from the BPF scheduler to a local
1829                  * DSQ, the task isn't associated with any DSQ but
1830                  * @p->scx.holding_cpu may be set under the protection of
1831                  * %SCX_OPSS_DISPATCHING.
1832                  */
1833                 if (p->scx.holding_cpu >= 0)
1834                         p->scx.holding_cpu = -1;
1835
1836                 return;
1837         }
1838
1839         if (!is_local)
1840                 raw_spin_lock(&dsq->lock);
1841
1842         /*
1843          * Now that we hold @dsq->lock, @p->holding_cpu and @p->scx.dsq_* can't
1844          * change underneath us.
1845         */
1846         if (p->scx.holding_cpu < 0) {
1847                 /* @p must still be on @dsq, dequeue */
1848                 task_unlink_from_dsq(p, dsq);
1849         } else {
1850                 /*
1851                  * We're racing against dispatch_to_local_dsq() which already
1852                  * removed @p from @dsq and set @p->scx.holding_cpu. Clear the
1853                  * holding_cpu which tells dispatch_to_local_dsq() that it lost
1854                  * the race.
1855                  */
1856                 WARN_ON_ONCE(!list_empty(&p->scx.dsq_list.node));
1857                 p->scx.holding_cpu = -1;
1858         }
1859         p->scx.dsq = NULL;
1860
1861         if (!is_local)
1862                 raw_spin_unlock(&dsq->lock);
1863 }
1864
1865 static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id,
1866                                                     struct task_struct *p)
1867 {
1868         struct scx_dispatch_q *dsq;
1869
1870         if (dsq_id == SCX_DSQ_LOCAL)
1871                 return &rq->scx.local_dsq;
1872
1873         if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) {
1874                 s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK;
1875
1876                 if (!ops_cpu_valid(cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict"))
1877                         return find_global_dsq(p);
1878
1879                 return &cpu_rq(cpu)->scx.local_dsq;
1880         }
1881
1882         if (dsq_id == SCX_DSQ_GLOBAL)
1883                 dsq = find_global_dsq(p);
1884         else
1885                 dsq = find_user_dsq(dsq_id);
1886
1887         if (unlikely(!dsq)) {
1888                 scx_ops_error("non-existent DSQ 0x%llx for %s[%d]",
1889                               dsq_id, p->comm, p->pid);
1890                 return find_global_dsq(p);
1891         }
1892
1893         return dsq;
1894 }
1895
1896 static void mark_direct_dispatch(struct task_struct *ddsp_task,
1897                                  struct task_struct *p, u64 dsq_id,
1898                                  u64 enq_flags)
1899 {
1900         /*
1901          * Mark that dispatch already happened from ops.select_cpu() or
1902          * ops.enqueue() by spoiling direct_dispatch_task with a non-NULL value
1903          * which can never match a valid task pointer.
1904          */
1905         __this_cpu_write(direct_dispatch_task, ERR_PTR(-ESRCH));
1906
1907         /* @p must match the task on the enqueue path */
1908         if (unlikely(p != ddsp_task)) {
1909                 if (IS_ERR(ddsp_task))
1910                         scx_ops_error("%s[%d] already direct-dispatched",
1911                                       p->comm, p->pid);
1912                 else
1913                         scx_ops_error("scheduling for %s[%d] but trying to direct-dispatch %s[%d]",
1914                                       ddsp_task->comm, ddsp_task->pid,
1915                                       p->comm, p->pid);
1916                 return;
1917         }
1918
1919         WARN_ON_ONCE(p->scx.ddsp_dsq_id != SCX_DSQ_INVALID);
1920         WARN_ON_ONCE(p->scx.ddsp_enq_flags);
1921
1922         p->scx.ddsp_dsq_id = dsq_id;
1923         p->scx.ddsp_enq_flags = enq_flags;
1924 }
1925
1926 static void direct_dispatch(struct task_struct *p, u64 enq_flags)
1927 {
1928         struct rq *rq = task_rq(p);
1929         struct scx_dispatch_q *dsq =
1930                 find_dsq_for_dispatch(rq, p->scx.ddsp_dsq_id, p);
1931
1932         touch_core_sched_dispatch(rq, p);
1933
1934         p->scx.ddsp_enq_flags |= enq_flags;
1935
1936         /*
1937          * We are in the enqueue path with @rq locked and pinned, and thus can't
1938          * double lock a remote rq and enqueue to its local DSQ. For
1939          * DSQ_LOCAL_ON verdicts targeting the local DSQ of a remote CPU, defer
1940          * the enqueue so that it's executed when @rq can be unlocked.
1941          */
1942         if (dsq->id == SCX_DSQ_LOCAL && dsq != &rq->scx.local_dsq) {
1943                 unsigned long opss;
1944
1945                 opss = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_STATE_MASK;
1946
1947                 switch (opss & SCX_OPSS_STATE_MASK) {
1948                 case SCX_OPSS_NONE:
1949                         break;
1950                 case SCX_OPSS_QUEUEING:
1951                         /*
1952                          * As @p was never passed to the BPF side, _release is
1953                          * not strictly necessary. Still do it for consistency.
1954                          */
1955                         atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
1956                         break;
1957                 default:
1958                         WARN_ONCE(true, "sched_ext: %s[%d] has invalid ops state 0x%lx in direct_dispatch()",
1959                                   p->comm, p->pid, opss);
1960                         atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
1961                         break;
1962                 }
1963
1964                 WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_list.node));
1965                 list_add_tail(&p->scx.dsq_list.node,
1966                               &rq->scx.ddsp_deferred_locals);
1967                 schedule_deferred(rq);
1968                 return;
1969         }
1970
1971         dispatch_enqueue(dsq, p, p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS);
1972 }
1973
1974 static bool scx_rq_online(struct rq *rq)
1975 {
1976         /*
1977          * Test both cpu_active() and %SCX_RQ_ONLINE. %SCX_RQ_ONLINE indicates
1978          * the online state as seen from the BPF scheduler. cpu_active() test
1979          * guarantees that, if this function returns %true, %SCX_RQ_ONLINE will
1980          * stay set until the current scheduling operation is complete even if
1981          * we aren't locking @rq.
1982          */
1983         return likely((rq->scx.flags & SCX_RQ_ONLINE) && cpu_active(cpu_of(rq)));
1984 }
1985
1986 static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
1987                             int sticky_cpu)
1988 {
1989         struct task_struct **ddsp_taskp;
1990         unsigned long qseq;
1991
1992         WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED));
1993
1994         /* rq migration */
1995         if (sticky_cpu == cpu_of(rq))
1996                 goto local_norefill;
1997
1998         /*
1999          * If !scx_rq_online(), we already told the BPF scheduler that the CPU
2000          * is offline and are just running the hotplug path. Don't bother the
2001          * BPF scheduler.
2002          */
2003         if (!scx_rq_online(rq))
2004                 goto local;
2005
2006         if (scx_rq_bypassing(rq))
2007                 goto global;
2008
2009         if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
2010                 goto direct;
2011
2012         /* see %SCX_OPS_ENQ_EXITING */
2013         if (!static_branch_unlikely(&scx_ops_enq_exiting) &&
2014             unlikely(p->flags & PF_EXITING))
2015                 goto local;
2016
2017         if (!SCX_HAS_OP(enqueue))
2018                 goto global;
2019
2020         /* DSQ bypass didn't trigger, enqueue on the BPF scheduler */
2021         qseq = rq->scx.ops_qseq++ << SCX_OPSS_QSEQ_SHIFT;
2022
2023         WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE);
2024         atomic_long_set(&p->scx.ops_state, SCX_OPSS_QUEUEING | qseq);
2025
2026         ddsp_taskp = this_cpu_ptr(&direct_dispatch_task);
2027         WARN_ON_ONCE(*ddsp_taskp);
2028         *ddsp_taskp = p;
2029
2030         SCX_CALL_OP_TASK(SCX_KF_ENQUEUE, enqueue, p, enq_flags);
2031
2032         *ddsp_taskp = NULL;
2033         if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
2034                 goto direct;
2035
2036         /*
2037          * If not directly dispatched, QUEUEING isn't clear yet and dispatch or
2038          * dequeue may be waiting. The store_release matches their load_acquire.
2039          */
2040         atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_QUEUED | qseq);
2041         return;
2042
2043 direct:
2044         direct_dispatch(p, enq_flags);
2045         return;
2046
2047 local:
2048         /*
2049          * For task-ordering, slice refill must be treated as implying the end
2050          * of the current slice. Otherwise, the longer @p stays on the CPU, the
2051          * higher priority it becomes from scx_prio_less()'s POV.
2052          */
2053         touch_core_sched(rq, p);
2054         p->scx.slice = SCX_SLICE_DFL;
2055 local_norefill:
2056         dispatch_enqueue(&rq->scx.local_dsq, p, enq_flags);
2057         return;
2058
2059 global:
2060         touch_core_sched(rq, p);        /* see the comment in local: */
2061         p->scx.slice = SCX_SLICE_DFL;
2062         dispatch_enqueue(find_global_dsq(p), p, enq_flags);
2063 }
2064
2065 static bool task_runnable(const struct task_struct *p)
2066 {
2067         return !list_empty(&p->scx.runnable_node);
2068 }
2069
2070 static void set_task_runnable(struct rq *rq, struct task_struct *p)
2071 {
2072         lockdep_assert_rq_held(rq);
2073
2074         if (p->scx.flags & SCX_TASK_RESET_RUNNABLE_AT) {
2075                 p->scx.runnable_at = jiffies;
2076                 p->scx.flags &= ~SCX_TASK_RESET_RUNNABLE_AT;
2077         }
2078
2079         /*
2080          * list_add_tail() must be used. scx_ops_bypass() depends on tasks being
2081          * appened to the runnable_list.
2082          */
2083         list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list);
2084 }
2085
2086 static void clr_task_runnable(struct task_struct *p, bool reset_runnable_at)
2087 {
2088         list_del_init(&p->scx.runnable_node);
2089         if (reset_runnable_at)
2090                 p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT;
2091 }
2092
2093 static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags)
2094 {
2095         int sticky_cpu = p->scx.sticky_cpu;
2096
2097         if (enq_flags & ENQUEUE_WAKEUP)
2098                 rq->scx.flags |= SCX_RQ_IN_WAKEUP;
2099
2100         enq_flags |= rq->scx.extra_enq_flags;
2101
2102         if (sticky_cpu >= 0)
2103                 p->scx.sticky_cpu = -1;
2104
2105         /*
2106          * Restoring a running task will be immediately followed by
2107          * set_next_task_scx() which expects the task to not be on the BPF
2108          * scheduler as tasks can only start running through local DSQs. Force
2109          * direct-dispatch into the local DSQ by setting the sticky_cpu.
2110          */
2111         if (unlikely(enq_flags & ENQUEUE_RESTORE) && task_current(rq, p))
2112                 sticky_cpu = cpu_of(rq);
2113
2114         if (p->scx.flags & SCX_TASK_QUEUED) {
2115                 WARN_ON_ONCE(!task_runnable(p));
2116                 goto out;
2117         }
2118
2119         set_task_runnable(rq, p);
2120         p->scx.flags |= SCX_TASK_QUEUED;
2121         rq->scx.nr_running++;
2122         add_nr_running(rq, 1);
2123
2124         if (SCX_HAS_OP(runnable) && !task_on_rq_migrating(p))
2125                 SCX_CALL_OP_TASK(SCX_KF_REST, runnable, p, enq_flags);
2126
2127         if (enq_flags & SCX_ENQ_WAKEUP)
2128                 touch_core_sched(rq, p);
2129
2130         do_enqueue_task(rq, p, enq_flags, sticky_cpu);
2131 out:
2132         rq->scx.flags &= ~SCX_RQ_IN_WAKEUP;
2133 }
2134
2135 static void ops_dequeue(struct task_struct *p, u64 deq_flags)
2136 {
2137         unsigned long opss;
2138
2139         /* dequeue is always temporary, don't reset runnable_at */
2140         clr_task_runnable(p, false);
2141
2142         /* acquire ensures that we see the preceding updates on QUEUED */
2143         opss = atomic_long_read_acquire(&p->scx.ops_state);
2144
2145         switch (opss & SCX_OPSS_STATE_MASK) {
2146         case SCX_OPSS_NONE:
2147                 break;
2148         case SCX_OPSS_QUEUEING:
2149                 /*
2150                  * QUEUEING is started and finished while holding @p's rq lock.
2151                  * As we're holding the rq lock now, we shouldn't see QUEUEING.
2152                  */
2153                 BUG();
2154         case SCX_OPSS_QUEUED:
2155                 if (SCX_HAS_OP(dequeue))
2156                         SCX_CALL_OP_TASK(SCX_KF_REST, dequeue, p, deq_flags);
2157
2158                 if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
2159                                             SCX_OPSS_NONE))
2160                         break;
2161                 fallthrough;
2162         case SCX_OPSS_DISPATCHING:
2163                 /*
2164                  * If @p is being dispatched from the BPF scheduler to a DSQ,
2165                  * wait for the transfer to complete so that @p doesn't get
2166                  * added to its DSQ after dequeueing is complete.
2167                  *
2168                  * As we're waiting on DISPATCHING with the rq locked, the
2169                  * dispatching side shouldn't try to lock the rq while
2170                  * DISPATCHING is set. See dispatch_to_local_dsq().
2171                  *
2172                  * DISPATCHING shouldn't have qseq set and control can reach
2173                  * here with NONE @opss from the above QUEUED case block.
2174                  * Explicitly wait on %SCX_OPSS_DISPATCHING instead of @opss.
2175                  */
2176                 wait_ops_state(p, SCX_OPSS_DISPATCHING);
2177                 BUG_ON(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE);
2178                 break;
2179         }
2180 }
2181
2182 static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags)
2183 {
2184         if (!(p->scx.flags & SCX_TASK_QUEUED)) {
2185                 WARN_ON_ONCE(task_runnable(p));
2186                 return true;
2187         }
2188
2189         ops_dequeue(p, deq_flags);
2190
2191         /*
2192          * A currently running task which is going off @rq first gets dequeued
2193          * and then stops running. As we want running <-> stopping transitions
2194          * to be contained within runnable <-> quiescent transitions, trigger
2195          * ->stopping() early here instead of in put_prev_task_scx().
2196          *
2197          * @p may go through multiple stopping <-> running transitions between
2198          * here and put_prev_task_scx() if task attribute changes occur while
2199          * balance_scx() leaves @rq unlocked. However, they don't contain any
2200          * information meaningful to the BPF scheduler and can be suppressed by
2201          * skipping the callbacks if the task is !QUEUED.
2202          */
2203         if (SCX_HAS_OP(stopping) && task_current(rq, p)) {
2204                 update_curr_scx(rq);
2205                 SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, false);
2206         }
2207
2208         if (SCX_HAS_OP(quiescent) && !task_on_rq_migrating(p))
2209                 SCX_CALL_OP_TASK(SCX_KF_REST, quiescent, p, deq_flags);
2210
2211         if (deq_flags & SCX_DEQ_SLEEP)
2212                 p->scx.flags |= SCX_TASK_DEQD_FOR_SLEEP;
2213         else
2214                 p->scx.flags &= ~SCX_TASK_DEQD_FOR_SLEEP;
2215
2216         p->scx.flags &= ~SCX_TASK_QUEUED;
2217         rq->scx.nr_running--;
2218         sub_nr_running(rq, 1);
2219
2220         dispatch_dequeue(rq, p);
2221         return true;
2222 }
2223
2224 static void yield_task_scx(struct rq *rq)
2225 {
2226         struct task_struct *p = rq->curr;
2227
2228         if (SCX_HAS_OP(yield))
2229                 SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, p, NULL);
2230         else
2231                 p->scx.slice = 0;
2232 }
2233
2234 static bool yield_to_task_scx(struct rq *rq, struct task_struct *to)
2235 {
2236         struct task_struct *from = rq->curr;
2237
2238         if (SCX_HAS_OP(yield))
2239                 return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, from, to);
2240         else
2241                 return false;
2242 }
2243
2244 static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
2245                                          struct scx_dispatch_q *src_dsq,
2246                                          struct rq *dst_rq)
2247 {
2248         struct scx_dispatch_q *dst_dsq = &dst_rq->scx.local_dsq;
2249
2250         /* @dsq is locked and @p is on @dst_rq */
2251         lockdep_assert_held(&src_dsq->lock);
2252         lockdep_assert_rq_held(dst_rq);
2253
2254         WARN_ON_ONCE(p->scx.holding_cpu >= 0);
2255
2256         if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT))
2257                 list_add(&p->scx.dsq_list.node, &dst_dsq->list);
2258         else
2259                 list_add_tail(&p->scx.dsq_list.node, &dst_dsq->list);
2260
2261         dsq_mod_nr(dst_dsq, 1);
2262         p->scx.dsq = dst_dsq;
2263 }
2264
2265 #ifdef CONFIG_SMP
2266 /**
2267  * move_remote_task_to_local_dsq - Move a task from a foreign rq to a local DSQ
2268  * @p: task to move
2269  * @enq_flags: %SCX_ENQ_*
2270  * @src_rq: rq to move the task from, locked on entry, released on return
2271  * @dst_rq: rq to move the task into, locked on return
2272  *
2273  * Move @p which is currently on @src_rq to @dst_rq's local DSQ.
2274  */
2275 static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
2276                                           struct rq *src_rq, struct rq *dst_rq)
2277 {
2278         lockdep_assert_rq_held(src_rq);
2279
2280         /* the following marks @p MIGRATING which excludes dequeue */
2281         deactivate_task(src_rq, p, 0);
2282         set_task_cpu(p, cpu_of(dst_rq));
2283         p->scx.sticky_cpu = cpu_of(dst_rq);
2284
2285         raw_spin_rq_unlock(src_rq);
2286         raw_spin_rq_lock(dst_rq);
2287
2288         /*
2289          * We want to pass scx-specific enq_flags but activate_task() will
2290          * truncate the upper 32 bit. As we own @rq, we can pass them through
2291          * @rq->scx.extra_enq_flags instead.
2292          */
2293         WARN_ON_ONCE(!cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr));
2294         WARN_ON_ONCE(dst_rq->scx.extra_enq_flags);
2295         dst_rq->scx.extra_enq_flags = enq_flags;
2296         activate_task(dst_rq, p, 0);
2297         dst_rq->scx.extra_enq_flags = 0;
2298 }
2299
2300 /*
2301  * Similar to kernel/sched/core.c::is_cpu_allowed(). However, there are two
2302  * differences:
2303  *
2304  * - is_cpu_allowed() asks "Can this task run on this CPU?" while
2305  *   task_can_run_on_remote_rq() asks "Can the BPF scheduler migrate the task to
2306  *   this CPU?".
2307  *
2308  *   While migration is disabled, is_cpu_allowed() has to say "yes" as the task
2309  *   must be allowed to finish on the CPU that it's currently on regardless of
2310  *   the CPU state. However, task_can_run_on_remote_rq() must say "no" as the
2311  *   BPF scheduler shouldn't attempt to migrate a task which has migration
2312  *   disabled.
2313  *
2314  * - The BPF scheduler is bypassed while the rq is offline and we can always say
2315  *   no to the BPF scheduler initiated migrations while offline.
2316  */
2317 static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
2318                                       bool trigger_error)
2319 {
2320         int cpu = cpu_of(rq);
2321
2322         /*
2323          * We don't require the BPF scheduler to avoid dispatching to offline
2324          * CPUs mostly for convenience but also because CPUs can go offline
2325          * between scx_bpf_dsq_insert() calls and here. Trigger error iff the
2326          * picked CPU is outside the allowed mask.
2327          */
2328         if (!task_allowed_on_cpu(p, cpu)) {
2329                 if (trigger_error)
2330                         scx_ops_error("SCX_DSQ_LOCAL[_ON] verdict target cpu %d not allowed for %s[%d]",
2331                                       cpu_of(rq), p->comm, p->pid);
2332                 return false;
2333         }
2334
2335         if (unlikely(is_migration_disabled(p)))
2336                 return false;
2337
2338         if (!scx_rq_online(rq))
2339                 return false;
2340
2341         return true;
2342 }
2343
2344 /**
2345  * unlink_dsq_and_lock_src_rq() - Unlink task from its DSQ and lock its task_rq
2346  * @p: target task
2347  * @dsq: locked DSQ @p is currently on
2348  * @src_rq: rq @p is currently on, stable with @dsq locked
2349  *
2350  * Called with @dsq locked but no rq's locked. We want to move @p to a different
2351  * DSQ, including any local DSQ, but are not locking @src_rq. Locking @src_rq is
2352  * required when transferring into a local DSQ. Even when transferring into a
2353  * non-local DSQ, it's better to use the same mechanism to protect against
2354  * dequeues and maintain the invariant that @p->scx.dsq can only change while
2355  * @src_rq is locked, which e.g. scx_dump_task() depends on.
2356  *
2357  * We want to grab @src_rq but that can deadlock if we try while locking @dsq,
2358  * so we want to unlink @p from @dsq, drop its lock and then lock @src_rq. As
2359  * this may race with dequeue, which can't drop the rq lock or fail, do a little
2360  * dancing from our side.
2361  *
2362  * @p->scx.holding_cpu is set to this CPU before @dsq is unlocked. If @p gets
2363  * dequeued after we unlock @dsq but before locking @src_rq, the holding_cpu
2364  * would be cleared to -1. While other cpus may have updated it to different
2365  * values afterwards, as this operation can't be preempted or recurse, the
2366  * holding_cpu can never become this CPU again before we're done. Thus, we can
2367  * tell whether we lost to dequeue by testing whether the holding_cpu still
2368  * points to this CPU. See dispatch_dequeue() for the counterpart.
2369  *
2370  * On return, @dsq is unlocked and @src_rq is locked. Returns %true if @p is
2371  * still valid. %false if lost to dequeue.
2372  */
2373 static bool unlink_dsq_and_lock_src_rq(struct task_struct *p,
2374                                        struct scx_dispatch_q *dsq,
2375                                        struct rq *src_rq)
2376 {
2377         s32 cpu = raw_smp_processor_id();
2378
2379         lockdep_assert_held(&dsq->lock);
2380
2381         WARN_ON_ONCE(p->scx.holding_cpu >= 0);
2382         task_unlink_from_dsq(p, dsq);
2383         p->scx.holding_cpu = cpu;
2384
2385         raw_spin_unlock(&dsq->lock);
2386         raw_spin_rq_lock(src_rq);
2387
2388         /* task_rq couldn't have changed if we're still the holding cpu */
2389         return likely(p->scx.holding_cpu == cpu) &&
2390                 !WARN_ON_ONCE(src_rq != task_rq(p));
2391 }
2392
2393 static bool consume_remote_task(struct rq *this_rq, struct task_struct *p,
2394                                 struct scx_dispatch_q *dsq, struct rq *src_rq)
2395 {
2396         raw_spin_rq_unlock(this_rq);
2397
2398         if (unlink_dsq_and_lock_src_rq(p, dsq, src_rq)) {
2399                 move_remote_task_to_local_dsq(p, 0, src_rq, this_rq);
2400                 return true;
2401         } else {
2402                 raw_spin_rq_unlock(src_rq);
2403                 raw_spin_rq_lock(this_rq);
2404                 return false;
2405         }
2406 }
2407 #else   /* CONFIG_SMP */
2408 static inline void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, struct rq *src_rq, struct rq *dst_rq) { WARN_ON_ONCE(1); }
2409 static inline bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, bool trigger_error) { return false; }
2410 static inline bool consume_remote_task(struct rq *this_rq, struct task_struct *p, struct scx_dispatch_q *dsq, struct rq *task_rq) { return false; }
2411 #endif  /* CONFIG_SMP */
2412
2413 /**
2414  * move_task_between_dsqs() - Move a task from one DSQ to another
2415  * @p: target task
2416  * @enq_flags: %SCX_ENQ_*
2417  * @src_dsq: DSQ @p is currently on, must not be a local DSQ
2418  * @dst_dsq: DSQ @p is being moved to, can be any DSQ
2419  *
2420  * Must be called with @p's task_rq and @src_dsq locked. If @dst_dsq is a local
2421  * DSQ and @p is on a different CPU, @p will be migrated and thus its task_rq
2422  * will change. As @p's task_rq is locked, this function doesn't need to use the
2423  * holding_cpu mechanism.
2424  *
2425  * On return, @src_dsq is unlocked and only @p's new task_rq, which is the
2426  * return value, is locked.
2427  */
2428 static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags,
2429                                          struct scx_dispatch_q *src_dsq,
2430                                          struct scx_dispatch_q *dst_dsq)
2431 {
2432         struct rq *src_rq = task_rq(p), *dst_rq;
2433
2434         BUG_ON(src_dsq->id == SCX_DSQ_LOCAL);
2435         lockdep_assert_held(&src_dsq->lock);
2436         lockdep_assert_rq_held(src_rq);
2437
2438         if (dst_dsq->id == SCX_DSQ_LOCAL) {
2439                 dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
2440                 if (!task_can_run_on_remote_rq(p, dst_rq, true)) {
2441                         dst_dsq = find_global_dsq(p);
2442                         dst_rq = src_rq;
2443                 }
2444         } else {
2445                 /* no need to migrate if destination is a non-local DSQ */
2446                 dst_rq = src_rq;
2447         }
2448
2449         /*
2450          * Move @p into $dst_dsq. If $dst_dsq is the local DSQ of a different
2451          * CPU, @p will be migrated.
2452          */
2453         if (dst_dsq->id == SCX_DSQ_LOCAL) {
2454                 /* @p is going from a non-local DSQ to a local DSQ */
2455                 if (src_rq == dst_rq) {
2456                         task_unlink_from_dsq(p, src_dsq);
2457                         move_local_task_to_local_dsq(p, enq_flags,
2458                                                      src_dsq, dst_rq);
2459                         raw_spin_unlock(&src_dsq->lock);
2460                 } else {
2461                         raw_spin_unlock(&src_dsq->lock);
2462                         move_remote_task_to_local_dsq(p, enq_flags,
2463                                                       src_rq, dst_rq);
2464                 }
2465         } else {
2466                 /*
2467                  * @p is going from a non-local DSQ to a non-local DSQ. As
2468                  * $src_dsq is already locked, do an abbreviated dequeue.
2469                  */
2470                 task_unlink_from_dsq(p, src_dsq);
2471                 p->scx.dsq = NULL;
2472                 raw_spin_unlock(&src_dsq->lock);
2473
2474                 dispatch_enqueue(dst_dsq, p, enq_flags);
2475         }
2476
2477         return dst_rq;
2478 }
2479
2480 /*
2481  * A poorly behaving BPF scheduler can live-lock the system by e.g. incessantly
2482  * banging on the same DSQ on a large NUMA system to the point where switching
2483  * to the bypass mode can take a long time. Inject artifical delays while the
2484  * bypass mode is switching to guarantee timely completion.
2485  */
2486 static void scx_ops_breather(struct rq *rq)
2487 {
2488         u64 until;
2489
2490         lockdep_assert_rq_held(rq);
2491
2492         if (likely(!atomic_read(&scx_ops_breather_depth)))
2493                 return;
2494
2495         raw_spin_rq_unlock(rq);
2496
2497         until = ktime_get_ns() + NSEC_PER_MSEC;
2498
2499         do {
2500                 int cnt = 1024;
2501                 while (atomic_read(&scx_ops_breather_depth) && --cnt)
2502                         cpu_relax();
2503         } while (atomic_read(&scx_ops_breather_depth) &&
2504                  time_before64(ktime_get_ns(), until));
2505
2506         raw_spin_rq_lock(rq);
2507 }
2508
2509 static bool consume_dispatch_q(struct rq *rq, struct scx_dispatch_q *dsq)
2510 {
2511         struct task_struct *p;
2512 retry:
2513         /*
2514          * This retry loop can repeatedly race against scx_ops_bypass()
2515          * dequeueing tasks from @dsq trying to put the system into the bypass
2516          * mode. On some multi-socket machines (e.g. 2x Intel 8480c), this can
2517          * live-lock the machine into soft lockups. Give a breather.
2518          */
2519         scx_ops_breather(rq);
2520
2521         /*
2522          * The caller can't expect to successfully consume a task if the task's
2523          * addition to @dsq isn't guaranteed to be visible somehow. Test
2524          * @dsq->list without locking and skip if it seems empty.
2525          */
2526         if (list_empty(&dsq->list))
2527                 return false;
2528
2529         raw_spin_lock(&dsq->lock);
2530
2531         nldsq_for_each_task(p, dsq) {
2532                 struct rq *task_rq = task_rq(p);
2533
2534                 if (rq == task_rq) {
2535                         task_unlink_from_dsq(p, dsq);
2536                         move_local_task_to_local_dsq(p, 0, dsq, rq);
2537                         raw_spin_unlock(&dsq->lock);
2538                         return true;
2539                 }
2540
2541                 if (task_can_run_on_remote_rq(p, rq, false)) {
2542                         if (likely(consume_remote_task(rq, p, dsq, task_rq)))
2543                                 return true;
2544                         goto retry;
2545                 }
2546         }
2547
2548         raw_spin_unlock(&dsq->lock);
2549         return false;
2550 }
2551
2552 static bool consume_global_dsq(struct rq *rq)
2553 {
2554         int node = cpu_to_node(cpu_of(rq));
2555
2556         return consume_dispatch_q(rq, global_dsqs[node]);
2557 }
2558
2559 /**
2560  * dispatch_to_local_dsq - Dispatch a task to a local dsq
2561  * @rq: current rq which is locked
2562  * @dst_dsq: destination DSQ
2563  * @p: task to dispatch
2564  * @enq_flags: %SCX_ENQ_*
2565  *
2566  * We're holding @rq lock and want to dispatch @p to @dst_dsq which is a local
2567  * DSQ. This function performs all the synchronization dancing needed because
2568  * local DSQs are protected with rq locks.
2569  *
2570  * The caller must have exclusive ownership of @p (e.g. through
2571  * %SCX_OPSS_DISPATCHING).
2572  */
2573 static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
2574                                   struct task_struct *p, u64 enq_flags)
2575 {
2576         struct rq *src_rq = task_rq(p);
2577         struct rq *dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
2578
2579         /*
2580          * We're synchronized against dequeue through DISPATCHING. As @p can't
2581          * be dequeued, its task_rq and cpus_allowed are stable too.
2582          *
2583          * If dispatching to @rq that @p is already on, no lock dancing needed.
2584          */
2585         if (rq == src_rq && rq == dst_rq) {
2586                 dispatch_enqueue(dst_dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS);
2587                 return;
2588         }
2589
2590 #ifdef CONFIG_SMP
2591         if (unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) {
2592                 dispatch_enqueue(find_global_dsq(p), p,
2593                                  enq_flags | SCX_ENQ_CLEAR_OPSS);
2594                 return;
2595         }
2596
2597         /*
2598          * @p is on a possibly remote @src_rq which we need to lock to move the
2599          * task. If dequeue is in progress, it'd be locking @src_rq and waiting
2600          * on DISPATCHING, so we can't grab @src_rq lock while holding
2601          * DISPATCHING.
2602          *
2603          * As DISPATCHING guarantees that @p is wholly ours, we can pretend that
2604          * we're moving from a DSQ and use the same mechanism - mark the task
2605          * under transfer with holding_cpu, release DISPATCHING and then follow
2606          * the same protocol. See unlink_dsq_and_lock_src_rq().
2607          */
2608         p->scx.holding_cpu = raw_smp_processor_id();
2609
2610         /* store_release ensures that dequeue sees the above */
2611         atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
2612
2613         /* switch to @src_rq lock */
2614         if (rq != src_rq) {
2615                 raw_spin_rq_unlock(rq);
2616                 raw_spin_rq_lock(src_rq);
2617         }
2618
2619         /* task_rq couldn't have changed if we're still the holding cpu */
2620         if (likely(p->scx.holding_cpu == raw_smp_processor_id()) &&
2621             !WARN_ON_ONCE(src_rq != task_rq(p))) {
2622                 /*
2623                  * If @p is staying on the same rq, there's no need to go
2624                  * through the full deactivate/activate cycle. Optimize by
2625                  * abbreviating move_remote_task_to_local_dsq().
2626                  */
2627                 if (src_rq == dst_rq) {
2628                         p->scx.holding_cpu = -1;
2629                         dispatch_enqueue(&dst_rq->scx.local_dsq, p, enq_flags);
2630                 } else {
2631                         move_remote_task_to_local_dsq(p, enq_flags,
2632                                                       src_rq, dst_rq);
2633                 }
2634
2635                 /* if the destination CPU is idle, wake it up */
2636                 if (sched_class_above(p->sched_class, dst_rq->curr->sched_class))
2637                         resched_curr(dst_rq);
2638         }
2639
2640         /* switch back to @rq lock */
2641         if (rq != dst_rq) {
2642                 raw_spin_rq_unlock(dst_rq);
2643                 raw_spin_rq_lock(rq);
2644         }
2645 #else   /* CONFIG_SMP */
2646         BUG();  /* control can not reach here on UP */
2647 #endif  /* CONFIG_SMP */
2648 }
2649
2650 /**
2651  * finish_dispatch - Asynchronously finish dispatching a task
2652  * @rq: current rq which is locked
2653  * @p: task to finish dispatching
2654  * @qseq_at_dispatch: qseq when @p started getting dispatched
2655  * @dsq_id: destination DSQ ID
2656  * @enq_flags: %SCX_ENQ_*
2657  *
2658  * Dispatching to local DSQs may need to wait for queueing to complete or
2659  * require rq lock dancing. As we don't wanna do either while inside
2660  * ops.dispatch() to avoid locking order inversion, we split dispatching into
2661  * two parts. scx_bpf_dsq_insert() which is called by ops.dispatch() records the
2662  * task and its qseq. Once ops.dispatch() returns, this function is called to
2663  * finish up.
2664  *
2665  * There is no guarantee that @p is still valid for dispatching or even that it
2666  * was valid in the first place. Make sure that the task is still owned by the
2667  * BPF scheduler and claim the ownership before dispatching.
2668  */
2669 static void finish_dispatch(struct rq *rq, struct task_struct *p,
2670                             unsigned long qseq_at_dispatch,
2671                             u64 dsq_id, u64 enq_flags)
2672 {
2673         struct scx_dispatch_q *dsq;
2674         unsigned long opss;
2675
2676         touch_core_sched_dispatch(rq, p);
2677 retry:
2678         /*
2679          * No need for _acquire here. @p is accessed only after a successful
2680          * try_cmpxchg to DISPATCHING.
2681          */
2682         opss = atomic_long_read(&p->scx.ops_state);
2683
2684         switch (opss & SCX_OPSS_STATE_MASK) {
2685         case SCX_OPSS_DISPATCHING:
2686         case SCX_OPSS_NONE:
2687                 /* someone else already got to it */
2688                 return;
2689         case SCX_OPSS_QUEUED:
2690                 /*
2691                  * If qseq doesn't match, @p has gone through at least one
2692                  * dispatch/dequeue and re-enqueue cycle between
2693                  * scx_bpf_dsq_insert() and here and we have no claim on it.
2694                  */
2695                 if ((opss & SCX_OPSS_QSEQ_MASK) != qseq_at_dispatch)
2696                         return;
2697
2698                 /*
2699                  * While we know @p is accessible, we don't yet have a claim on
2700                  * it - the BPF scheduler is allowed to dispatch tasks
2701                  * spuriously and there can be a racing dequeue attempt. Let's
2702                  * claim @p by atomically transitioning it from QUEUED to
2703                  * DISPATCHING.
2704                  */
2705                 if (likely(atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
2706                                                    SCX_OPSS_DISPATCHING)))
2707                         break;
2708                 goto retry;
2709         case SCX_OPSS_QUEUEING:
2710                 /*
2711                  * do_enqueue_task() is in the process of transferring the task
2712                  * to the BPF scheduler while holding @p's rq lock. As we aren't
2713                  * holding any kernel or BPF resource that the enqueue path may
2714                  * depend upon, it's safe to wait.
2715                  */
2716                 wait_ops_state(p, opss);
2717                 goto retry;
2718         }
2719
2720         BUG_ON(!(p->scx.flags & SCX_TASK_QUEUED));
2721
2722         dsq = find_dsq_for_dispatch(this_rq(), dsq_id, p);
2723
2724         if (dsq->id == SCX_DSQ_LOCAL)
2725                 dispatch_to_local_dsq(rq, dsq, p, enq_flags);
2726         else
2727                 dispatch_enqueue(dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS);
2728 }
2729
2730 static void flush_dispatch_buf(struct rq *rq)
2731 {
2732         struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
2733         u32 u;
2734
2735         for (u = 0; u < dspc->cursor; u++) {
2736                 struct scx_dsp_buf_ent *ent = &dspc->buf[u];
2737
2738                 finish_dispatch(rq, ent->task, ent->qseq, ent->dsq_id,
2739                                 ent->enq_flags);
2740         }
2741
2742         dspc->nr_tasks += dspc->cursor;
2743         dspc->cursor = 0;
2744 }
2745
2746 static int balance_one(struct rq *rq, struct task_struct *prev)
2747 {
2748         struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
2749         bool prev_on_scx = prev->sched_class == &ext_sched_class;
2750         int nr_loops = SCX_DSP_MAX_LOOPS;
2751
2752         lockdep_assert_rq_held(rq);
2753         rq->scx.flags |= SCX_RQ_IN_BALANCE;
2754         rq->scx.flags &= ~(SCX_RQ_BAL_PENDING | SCX_RQ_BAL_KEEP);
2755
2756         if (static_branch_unlikely(&scx_ops_cpu_preempt) &&
2757             unlikely(rq->scx.cpu_released)) {
2758                 /*
2759                  * If the previous sched_class for the current CPU was not SCX,
2760                  * notify the BPF scheduler that it again has control of the
2761                  * core. This callback complements ->cpu_release(), which is
2762                  * emitted in switch_class().
2763                  */
2764                 if (SCX_HAS_OP(cpu_acquire))
2765                         SCX_CALL_OP(SCX_KF_REST, cpu_acquire, cpu_of(rq), NULL);
2766                 rq->scx.cpu_released = false;
2767         }
2768
2769         if (prev_on_scx) {
2770                 update_curr_scx(rq);
2771
2772                 /*
2773                  * If @prev is runnable & has slice left, it has priority and
2774                  * fetching more just increases latency for the fetched tasks.
2775                  * Tell pick_task_scx() to keep running @prev. If the BPF
2776                  * scheduler wants to handle this explicitly, it should
2777                  * implement ->cpu_release().
2778                  *
2779                  * See scx_ops_disable_workfn() for the explanation on the
2780                  * bypassing test.
2781                  */
2782                 if ((prev->scx.flags & SCX_TASK_QUEUED) &&
2783                     prev->scx.slice && !scx_rq_bypassing(rq)) {
2784                         rq->scx.flags |= SCX_RQ_BAL_KEEP;
2785                         goto has_tasks;
2786                 }
2787         }
2788
2789         /* if there already are tasks to run, nothing to do */
2790         if (rq->scx.local_dsq.nr)
2791                 goto has_tasks;
2792
2793         if (consume_global_dsq(rq))
2794                 goto has_tasks;
2795
2796         if (!SCX_HAS_OP(dispatch) || scx_rq_bypassing(rq) || !scx_rq_online(rq))
2797                 goto no_tasks;
2798
2799         dspc->rq = rq;
2800
2801         /*
2802          * The dispatch loop. Because flush_dispatch_buf() may drop the rq lock,
2803          * the local DSQ might still end up empty after a successful
2804          * ops.dispatch(). If the local DSQ is empty even after ops.dispatch()
2805          * produced some tasks, retry. The BPF scheduler may depend on this
2806          * looping behavior to simplify its implementation.
2807          */
2808         do {
2809                 dspc->nr_tasks = 0;
2810
2811                 SCX_CALL_OP(SCX_KF_DISPATCH, dispatch, cpu_of(rq),
2812                             prev_on_scx ? prev : NULL);
2813
2814                 flush_dispatch_buf(rq);
2815
2816                 if (rq->scx.local_dsq.nr)
2817                         goto has_tasks;
2818                 if (consume_global_dsq(rq))
2819                         goto has_tasks;
2820
2821                 /*
2822                  * ops.dispatch() can trap us in this loop by repeatedly
2823                  * dispatching ineligible tasks. Break out once in a while to
2824                  * allow the watchdog to run. As IRQ can't be enabled in
2825                  * balance(), we want to complete this scheduling cycle and then
2826                  * start a new one. IOW, we want to call resched_curr() on the
2827                  * next, most likely idle, task, not the current one. Use
2828                  * scx_bpf_kick_cpu() for deferred kicking.
2829                  */
2830                 if (unlikely(!--nr_loops)) {
2831                         scx_bpf_kick_cpu(cpu_of(rq), 0);
2832                         break;
2833                 }
2834         } while (dspc->nr_tasks);
2835
2836 no_tasks:
2837         /*
2838          * Didn't find another task to run. Keep running @prev unless
2839          * %SCX_OPS_ENQ_LAST is in effect.
2840          */
2841         if ((prev->scx.flags & SCX_TASK_QUEUED) &&
2842             (!static_branch_unlikely(&scx_ops_enq_last) ||
2843              scx_rq_bypassing(rq))) {
2844                 rq->scx.flags |= SCX_RQ_BAL_KEEP;
2845                 goto has_tasks;
2846         }
2847         rq->scx.flags &= ~SCX_RQ_IN_BALANCE;
2848         return false;
2849
2850 has_tasks:
2851         rq->scx.flags &= ~SCX_RQ_IN_BALANCE;
2852         return true;
2853 }
2854
2855 static int balance_scx(struct rq *rq, struct task_struct *prev,
2856                        struct rq_flags *rf)
2857 {
2858         int ret;
2859
2860         rq_unpin_lock(rq, rf);
2861
2862         ret = balance_one(rq, prev);
2863
2864 #ifdef CONFIG_SCHED_SMT
2865         /*
2866          * When core-sched is enabled, this ops.balance() call will be followed
2867          * by pick_task_scx() on this CPU and the SMT siblings. Balance the
2868          * siblings too.
2869          */
2870         if (sched_core_enabled(rq)) {
2871                 const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq));
2872                 int scpu;
2873
2874                 for_each_cpu_andnot(scpu, smt_mask, cpumask_of(cpu_of(rq))) {
2875                         struct rq *srq = cpu_rq(scpu);
2876                         struct task_struct *sprev = srq->curr;
2877
2878                         WARN_ON_ONCE(__rq_lockp(rq) != __rq_lockp(srq));
2879                         update_rq_clock(srq);
2880                         balance_one(srq, sprev);
2881                 }
2882         }
2883 #endif
2884         rq_repin_lock(rq, rf);
2885
2886         return ret;
2887 }
2888
2889 static void process_ddsp_deferred_locals(struct rq *rq)
2890 {
2891         struct task_struct *p;
2892
2893         lockdep_assert_rq_held(rq);
2894
2895         /*
2896          * Now that @rq can be unlocked, execute the deferred enqueueing of
2897          * tasks directly dispatched to the local DSQs of other CPUs. See
2898          * direct_dispatch(). Keep popping from the head instead of using
2899          * list_for_each_entry_safe() as dispatch_local_dsq() may unlock @rq
2900          * temporarily.
2901          */
2902         while ((p = list_first_entry_or_null(&rq->scx.ddsp_deferred_locals,
2903                                 struct task_struct, scx.dsq_list.node))) {
2904                 struct scx_dispatch_q *dsq;
2905
2906                 list_del_init(&p->scx.dsq_list.node);
2907
2908                 dsq = find_dsq_for_dispatch(rq, p->scx.ddsp_dsq_id, p);
2909                 if (!WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL))
2910                         dispatch_to_local_dsq(rq, dsq, p, p->scx.ddsp_enq_flags);
2911         }
2912 }
2913
2914 static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first)
2915 {
2916         if (p->scx.flags & SCX_TASK_QUEUED) {
2917                 /*
2918                  * Core-sched might decide to execute @p before it is
2919                  * dispatched. Call ops_dequeue() to notify the BPF scheduler.
2920                  */
2921                 ops_dequeue(p, SCX_DEQ_CORE_SCHED_EXEC);
2922                 dispatch_dequeue(rq, p);
2923         }
2924
2925         p->se.exec_start = rq_clock_task(rq);
2926
2927         /* see dequeue_task_scx() on why we skip when !QUEUED */
2928         if (SCX_HAS_OP(running) && (p->scx.flags & SCX_TASK_QUEUED))
2929                 SCX_CALL_OP_TASK(SCX_KF_REST, running, p);
2930
2931         clr_task_runnable(p, true);
2932
2933         /*
2934          * @p is getting newly scheduled or got kicked after someone updated its
2935          * slice. Refresh whether tick can be stopped. See scx_can_stop_tick().
2936          */
2937         if ((p->scx.slice == SCX_SLICE_INF) !=
2938             (bool)(rq->scx.flags & SCX_RQ_CAN_STOP_TICK)) {
2939                 if (p->scx.slice == SCX_SLICE_INF)
2940                         rq->scx.flags |= SCX_RQ_CAN_STOP_TICK;
2941                 else
2942                         rq->scx.flags &= ~SCX_RQ_CAN_STOP_TICK;
2943
2944                 sched_update_tick_dependency(rq);
2945
2946                 /*
2947                  * For now, let's refresh the load_avgs just when transitioning
2948                  * in and out of nohz. In the future, we might want to add a
2949                  * mechanism which calls the following periodically on
2950                  * tick-stopped CPUs.
2951                  */
2952                 update_other_load_avgs(rq);
2953         }
2954 }
2955
2956 static enum scx_cpu_preempt_reason
2957 preempt_reason_from_class(const struct sched_class *class)
2958 {
2959 #ifdef CONFIG_SMP
2960         if (class == &stop_sched_class)
2961                 return SCX_CPU_PREEMPT_STOP;
2962 #endif
2963         if (class == &dl_sched_class)
2964                 return SCX_CPU_PREEMPT_DL;
2965         if (class == &rt_sched_class)
2966                 return SCX_CPU_PREEMPT_RT;
2967         return SCX_CPU_PREEMPT_UNKNOWN;
2968 }
2969
2970 static void switch_class(struct rq *rq, struct task_struct *next)
2971 {
2972         const struct sched_class *next_class = next->sched_class;
2973
2974 #ifdef CONFIG_SMP
2975         /*
2976          * Pairs with the smp_load_acquire() issued by a CPU in
2977          * kick_cpus_irq_workfn() who is waiting for this CPU to perform a
2978          * resched.
2979          */
2980         smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1);
2981 #endif
2982         if (!static_branch_unlikely(&scx_ops_cpu_preempt))
2983                 return;
2984
2985         /*
2986          * The callback is conceptually meant to convey that the CPU is no
2987          * longer under the control of SCX. Therefore, don't invoke the callback
2988          * if the next class is below SCX (in which case the BPF scheduler has
2989          * actively decided not to schedule any tasks on the CPU).
2990          */
2991         if (sched_class_above(&ext_sched_class, next_class))
2992                 return;
2993
2994         /*
2995          * At this point we know that SCX was preempted by a higher priority
2996          * sched_class, so invoke the ->cpu_release() callback if we have not
2997          * done so already. We only send the callback once between SCX being
2998          * preempted, and it regaining control of the CPU.
2999          *
3000          * ->cpu_release() complements ->cpu_acquire(), which is emitted the
3001          *  next time that balance_scx() is invoked.
3002          */
3003         if (!rq->scx.cpu_released) {
3004                 if (SCX_HAS_OP(cpu_release)) {
3005                         struct scx_cpu_release_args args = {
3006                                 .reason = preempt_reason_from_class(next_class),
3007                                 .task = next,
3008                         };
3009
3010                         SCX_CALL_OP(SCX_KF_CPU_RELEASE,
3011                                     cpu_release, cpu_of(rq), &args);
3012                 }
3013                 rq->scx.cpu_released = true;
3014         }
3015 }
3016
3017 static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
3018                               struct task_struct *next)
3019 {
3020         update_curr_scx(rq);
3021
3022         /* see dequeue_task_scx() on why we skip when !QUEUED */
3023         if (SCX_HAS_OP(stopping) && (p->scx.flags & SCX_TASK_QUEUED))
3024                 SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, true);
3025
3026         if (p->scx.flags & SCX_TASK_QUEUED) {
3027                 set_task_runnable(rq, p);
3028
3029                 /*
3030                  * If @p has slice left and is being put, @p is getting
3031                  * preempted by a higher priority scheduler class or core-sched
3032                  * forcing a different task. Leave it at the head of the local
3033                  * DSQ.
3034                  */
3035                 if (p->scx.slice && !scx_rq_bypassing(rq)) {
3036                         dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD);
3037                         return;
3038                 }
3039
3040                 /*
3041                  * If @p is runnable but we're about to enter a lower
3042                  * sched_class, %SCX_OPS_ENQ_LAST must be set. Tell
3043                  * ops.enqueue() that @p is the only one available for this cpu,
3044                  * which should trigger an explicit follow-up scheduling event.
3045                  */
3046                 if (sched_class_above(&ext_sched_class, next->sched_class)) {
3047                         WARN_ON_ONCE(!static_branch_unlikely(&scx_ops_enq_last));
3048                         do_enqueue_task(rq, p, SCX_ENQ_LAST, -1);
3049                 } else {
3050                         do_enqueue_task(rq, p, 0, -1);
3051                 }
3052         }
3053
3054         if (next && next->sched_class != &ext_sched_class)
3055                 switch_class(rq, next);
3056 }
3057
3058 static struct task_struct *first_local_task(struct rq *rq)
3059 {
3060         return list_first_entry_or_null(&rq->scx.local_dsq.list,
3061                                         struct task_struct, scx.dsq_list.node);
3062 }
3063
3064 static struct task_struct *pick_task_scx(struct rq *rq)
3065 {
3066         struct task_struct *prev = rq->curr;
3067         struct task_struct *p;
3068         bool prev_on_scx = prev->sched_class == &ext_sched_class;
3069         bool keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP;
3070         bool kick_idle = false;
3071
3072         /*
3073          * WORKAROUND:
3074          *
3075          * %SCX_RQ_BAL_KEEP should be set iff $prev is on SCX as it must just
3076          * have gone through balance_scx(). Unfortunately, there currently is a
3077          * bug where fair could say yes on balance() but no on pick_task(),
3078          * which then ends up calling pick_task_scx() without preceding
3079          * balance_scx().
3080          *
3081          * Keep running @prev if possible and avoid stalling from entering idle
3082          * without balancing.
3083          *
3084          * Once fair is fixed, remove the workaround and trigger WARN_ON_ONCE()
3085          * if pick_task_scx() is called without preceding balance_scx().
3086          */
3087         if (unlikely(rq->scx.flags & SCX_RQ_BAL_PENDING)) {
3088                 if (prev_on_scx) {
3089                         keep_prev = true;
3090                 } else {
3091                         keep_prev = false;
3092                         kick_idle = true;
3093                 }
3094         } else if (unlikely(keep_prev && !prev_on_scx)) {
3095                 /* only allowed during transitions */
3096                 WARN_ON_ONCE(scx_ops_enable_state() == SCX_OPS_ENABLED);
3097                 keep_prev = false;
3098         }
3099
3100         /*
3101          * If balance_scx() is telling us to keep running @prev, replenish slice
3102          * if necessary and keep running @prev. Otherwise, pop the first one
3103          * from the local DSQ.
3104          */
3105         if (keep_prev) {
3106                 p = prev;
3107                 if (!p->scx.slice)
3108                         p->scx.slice = SCX_SLICE_DFL;
3109         } else {
3110                 p = first_local_task(rq);
3111                 if (!p) {
3112                         if (kick_idle)
3113                                 scx_bpf_kick_cpu(cpu_of(rq), SCX_KICK_IDLE);
3114                         return NULL;
3115                 }
3116
3117                 if (unlikely(!p->scx.slice)) {
3118                         if (!scx_rq_bypassing(rq) && !scx_warned_zero_slice) {
3119                                 printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in %s()\n",
3120                                                 p->comm, p->pid, __func__);
3121                                 scx_warned_zero_slice = true;
3122                         }
3123                         p->scx.slice = SCX_SLICE_DFL;
3124                 }
3125         }
3126
3127         return p;
3128 }
3129
3130 #ifdef CONFIG_SCHED_CORE
3131 /**
3132  * scx_prio_less - Task ordering for core-sched
3133  * @a: task A
3134  * @b: task B
3135  *
3136  * Core-sched is implemented as an additional scheduling layer on top of the
3137  * usual sched_class'es and needs to find out the expected task ordering. For
3138  * SCX, core-sched calls this function to interrogate the task ordering.
3139  *
3140  * Unless overridden by ops.core_sched_before(), @p->scx.core_sched_at is used
3141  * to implement the default task ordering. The older the timestamp, the higher
3142  * prority the task - the global FIFO ordering matching the default scheduling
3143  * behavior.
3144  *
3145  * When ops.core_sched_before() is enabled, @p->scx.core_sched_at is used to
3146  * implement FIFO ordering within each local DSQ. See pick_task_scx().
3147  */
3148 bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
3149                    bool in_fi)
3150 {
3151         /*
3152          * The const qualifiers are dropped from task_struct pointers when
3153          * calling ops.core_sched_before(). Accesses are controlled by the
3154          * verifier.
3155          */
3156         if (SCX_HAS_OP(core_sched_before) && !scx_rq_bypassing(task_rq(a)))
3157                 return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, core_sched_before,
3158                                               (struct task_struct *)a,
3159                                               (struct task_struct *)b);
3160         else
3161                 return time_after64(a->scx.core_sched_at, b->scx.core_sched_at);
3162 }
3163 #endif  /* CONFIG_SCHED_CORE */
3164
3165 #ifdef CONFIG_SMP
3166
3167 static bool test_and_clear_cpu_idle(int cpu)
3168 {
3169 #ifdef CONFIG_SCHED_SMT
3170         /*
3171          * SMT mask should be cleared whether we can claim @cpu or not. The SMT
3172          * cluster is not wholly idle either way. This also prevents
3173          * scx_pick_idle_cpu() from getting caught in an infinite loop.
3174          */
3175         if (sched_smt_active()) {
3176                 const struct cpumask *smt = cpu_smt_mask(cpu);
3177
3178                 /*
3179                  * If offline, @cpu is not its own sibling and
3180                  * scx_pick_idle_cpu() can get caught in an infinite loop as
3181                  * @cpu is never cleared from idle_masks.smt. Ensure that @cpu
3182                  * is eventually cleared.
3183                  */
3184                 if (cpumask_intersects(smt, idle_masks.smt))
3185                         cpumask_andnot(idle_masks.smt, idle_masks.smt, smt);
3186                 else if (cpumask_test_cpu(cpu, idle_masks.smt))
3187                         __cpumask_clear_cpu(cpu, idle_masks.smt);
3188         }
3189 #endif
3190         return cpumask_test_and_clear_cpu(cpu, idle_masks.cpu);
3191 }
3192
3193 static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags)
3194 {
3195         int cpu;
3196
3197 retry:
3198         if (sched_smt_active()) {
3199                 cpu = cpumask_any_and_distribute(idle_masks.smt, cpus_allowed);
3200                 if (cpu < nr_cpu_ids)
3201                         goto found;
3202
3203                 if (flags & SCX_PICK_IDLE_CORE)
3204                         return -EBUSY;
3205         }
3206
3207         cpu = cpumask_any_and_distribute(idle_masks.cpu, cpus_allowed);
3208         if (cpu >= nr_cpu_ids)
3209                 return -EBUSY;
3210
3211 found:
3212         if (test_and_clear_cpu_idle(cpu))
3213                 return cpu;
3214         else
3215                 goto retry;
3216 }
3217
3218 /*
3219  * Return true if the LLC domains do not perfectly overlap with the NUMA
3220  * domains, false otherwise.
3221  */
3222 static bool llc_numa_mismatch(void)
3223 {
3224         int cpu;
3225
3226         /*
3227          * We need to scan all online CPUs to verify whether their scheduling
3228          * domains overlap.
3229          *
3230          * While it is rare to encounter architectures with asymmetric NUMA
3231          * topologies, CPU hotplugging or virtualized environments can result
3232          * in asymmetric configurations.
3233          *
3234          * For example:
3235          *
3236          *  NUMA 0:
3237          *    - LLC 0: cpu0..cpu7
3238          *    - LLC 1: cpu8..cpu15 [offline]
3239          *
3240          *  NUMA 1:
3241          *    - LLC 0: cpu16..cpu23
3242          *    - LLC 1: cpu24..cpu31
3243          *
3244          * In this case, if we only check the first online CPU (cpu0), we might
3245          * incorrectly assume that the LLC and NUMA domains are fully
3246          * overlapping, which is incorrect (as NUMA 1 has two distinct LLC
3247          * domains).
3248          */
3249         for_each_online_cpu(cpu) {
3250                 const struct cpumask *numa_cpus;
3251                 struct sched_domain *sd;
3252
3253                 sd = rcu_dereference(per_cpu(sd_llc, cpu));
3254                 if (!sd)
3255                         return true;
3256
3257                 numa_cpus = cpumask_of_node(cpu_to_node(cpu));
3258                 if (sd->span_weight != cpumask_weight(numa_cpus))
3259                         return true;
3260         }
3261
3262         return false;
3263 }
3264
3265 /*
3266  * Initialize topology-aware scheduling.
3267  *
3268  * Detect if the system has multiple LLC or multiple NUMA domains and enable
3269  * cache-aware / NUMA-aware scheduling optimizations in the default CPU idle
3270  * selection policy.
3271  *
3272  * Assumption: the kernel's internal topology representation assumes that each
3273  * CPU belongs to a single LLC domain, and that each LLC domain is entirely
3274  * contained within a single NUMA node.
3275  */
3276 static void update_selcpu_topology(void)
3277 {
3278         bool enable_llc = false, enable_numa = false;
3279         struct sched_domain *sd;
3280         const struct cpumask *cpus;
3281         s32 cpu = cpumask_first(cpu_online_mask);
3282
3283         /*
3284          * Enable LLC domain optimization only when there are multiple LLC
3285          * domains among the online CPUs. If all online CPUs are part of a
3286          * single LLC domain, the idle CPU selection logic can choose any
3287          * online CPU without bias.
3288          *
3289          * Note that it is sufficient to check the LLC domain of the first
3290          * online CPU to determine whether a single LLC domain includes all
3291          * CPUs.
3292          */
3293         rcu_read_lock();
3294         sd = rcu_dereference(per_cpu(sd_llc, cpu));
3295         if (sd) {
3296                 if (sd->span_weight < num_online_cpus())
3297                         enable_llc = true;
3298         }
3299
3300         /*
3301          * Enable NUMA optimization only when there are multiple NUMA domains
3302          * among the online CPUs and the NUMA domains don't perfectly overlaps
3303          * with the LLC domains.
3304          *
3305          * If all CPUs belong to the same NUMA node and the same LLC domain,
3306          * enabling both NUMA and LLC optimizations is unnecessary, as checking
3307          * for an idle CPU in the same domain twice is redundant.
3308          */
3309         cpus = cpumask_of_node(cpu_to_node(cpu));
3310         if ((cpumask_weight(cpus) < num_online_cpus()) && llc_numa_mismatch())
3311                 enable_numa = true;
3312         rcu_read_unlock();
3313
3314         pr_debug("sched_ext: LLC idle selection %s\n",
3315                  enable_llc ? "enabled" : "disabled");
3316         pr_debug("sched_ext: NUMA idle selection %s\n",
3317                  enable_numa ? "enabled" : "disabled");
3318
3319         if (enable_llc)
3320                 static_branch_enable_cpuslocked(&scx_selcpu_topo_llc);
3321         else
3322                 static_branch_disable_cpuslocked(&scx_selcpu_topo_llc);
3323         if (enable_numa)
3324                 static_branch_enable_cpuslocked(&scx_selcpu_topo_numa);
3325         else
3326                 static_branch_disable_cpuslocked(&scx_selcpu_topo_numa);
3327 }
3328
3329 /*
3330  * Built-in CPU idle selection policy:
3331  *
3332  * 1. Prioritize full-idle cores:
3333  *   - always prioritize CPUs from fully idle cores (both logical CPUs are
3334  *     idle) to avoid interference caused by SMT.
3335  *
3336  * 2. Reuse the same CPU:
3337  *   - prefer the last used CPU to take advantage of cached data (L1, L2) and
3338  *     branch prediction optimizations.
3339  *
3340  * 3. Pick a CPU within the same LLC (Last-Level Cache):
3341  *   - if the above conditions aren't met, pick a CPU that shares the same LLC
3342  *     to maintain cache locality.
3343  *
3344  * 4. Pick a CPU within the same NUMA node, if enabled:
3345  *   - choose a CPU from the same NUMA node to reduce memory access latency.
3346  *
3347  * Step 3 and 4 are performed only if the system has, respectively, multiple
3348  * LLC domains / multiple NUMA nodes (see scx_selcpu_topo_llc and
3349  * scx_selcpu_topo_numa).
3350  *
3351  * NOTE: tasks that can only run on 1 CPU are excluded by this logic, because
3352  * we never call ops.select_cpu() for them, see select_task_rq().
3353  */
3354 static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
3355                               u64 wake_flags, bool *found)
3356 {
3357         const struct cpumask *llc_cpus = NULL;
3358         const struct cpumask *numa_cpus = NULL;
3359         s32 cpu;
3360
3361         *found = false;
3362
3363
3364         /*
3365          * This is necessary to protect llc_cpus.
3366          */
3367         rcu_read_lock();
3368
3369         /*
3370          * Determine the scheduling domain only if the task is allowed to run
3371          * on all CPUs.
3372          *
3373          * This is done primarily for efficiency, as it avoids the overhead of
3374          * updating a cpumask every time we need to select an idle CPU (which
3375          * can be costly in large SMP systems), but it also aligns logically:
3376          * if a task's scheduling domain is restricted by user-space (through
3377          * CPU affinity), the task will simply use the flat scheduling domain
3378          * defined by user-space.
3379          */
3380         if (p->nr_cpus_allowed >= num_possible_cpus()) {
3381                 if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa))
3382                         numa_cpus = cpumask_of_node(cpu_to_node(prev_cpu));
3383
3384                 if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc)) {
3385                         struct sched_domain *sd;
3386
3387                         sd = rcu_dereference(per_cpu(sd_llc, prev_cpu));
3388                         if (sd)
3389                                 llc_cpus = sched_domain_span(sd);
3390                 }
3391         }
3392
3393         /*
3394          * If WAKE_SYNC, try to migrate the wakee to the waker's CPU.
3395          */
3396         if (wake_flags & SCX_WAKE_SYNC) {
3397                 cpu = smp_processor_id();
3398
3399                 /*
3400                  * If the waker's CPU is cache affine and prev_cpu is idle,
3401                  * then avoid a migration.
3402                  */
3403                 if (cpus_share_cache(cpu, prev_cpu) &&
3404                     test_and_clear_cpu_idle(prev_cpu)) {
3405                         cpu = prev_cpu;
3406                         goto cpu_found;
3407                 }
3408
3409                 /*
3410                  * If the waker's local DSQ is empty, and the system is under
3411                  * utilized, try to wake up @p to the local DSQ of the waker.
3412                  *
3413                  * Checking only for an empty local DSQ is insufficient as it
3414                  * could give the wakee an unfair advantage when the system is
3415                  * oversaturated.
3416                  *
3417                  * Checking only for the presence of idle CPUs is also
3418                  * insufficient as the local DSQ of the waker could have tasks
3419                  * piled up on it even if there is an idle core elsewhere on
3420                  * the system.
3421                  */
3422                 if (!cpumask_empty(idle_masks.cpu) &&
3423                     !(current->flags & PF_EXITING) &&
3424                     cpu_rq(cpu)->scx.local_dsq.nr == 0) {
3425                         if (cpumask_test_cpu(cpu, p->cpus_ptr))
3426                                 goto cpu_found;
3427                 }
3428         }
3429
3430         /*
3431          * If CPU has SMT, any wholly idle CPU is likely a better pick than
3432          * partially idle @prev_cpu.
3433          */
3434         if (sched_smt_active()) {
3435                 /*
3436                  * Keep using @prev_cpu if it's part of a fully idle core.
3437                  */
3438                 if (cpumask_test_cpu(prev_cpu, idle_masks.smt) &&
3439                     test_and_clear_cpu_idle(prev_cpu)) {
3440                         cpu = prev_cpu;
3441                         goto cpu_found;
3442                 }
3443
3444                 /*
3445                  * Search for any fully idle core in the same LLC domain.
3446                  */
3447                 if (llc_cpus) {
3448                         cpu = scx_pick_idle_cpu(llc_cpus, SCX_PICK_IDLE_CORE);
3449                         if (cpu >= 0)
3450                                 goto cpu_found;
3451                 }
3452
3453                 /*
3454                  * Search for any fully idle core in the same NUMA node.
3455                  */
3456                 if (numa_cpus) {
3457                         cpu = scx_pick_idle_cpu(numa_cpus, SCX_PICK_IDLE_CORE);
3458                         if (cpu >= 0)
3459                                 goto cpu_found;
3460                 }
3461
3462                 /*
3463                  * Search for any full idle core usable by the task.
3464                  */
3465                 cpu = scx_pick_idle_cpu(p->cpus_ptr, SCX_PICK_IDLE_CORE);
3466                 if (cpu >= 0)
3467                         goto cpu_found;
3468         }
3469
3470         /*
3471          * Use @prev_cpu if it's idle.
3472          */
3473         if (test_and_clear_cpu_idle(prev_cpu)) {
3474                 cpu = prev_cpu;
3475                 goto cpu_found;
3476         }
3477
3478         /*
3479          * Search for any idle CPU in the same LLC domain.
3480          */
3481         if (llc_cpus) {
3482                 cpu = scx_pick_idle_cpu(llc_cpus, 0);
3483                 if (cpu >= 0)
3484                         goto cpu_found;
3485         }
3486
3487         /*
3488          * Search for any idle CPU in the same NUMA node.
3489          */
3490         if (numa_cpus) {
3491                 cpu = scx_pick_idle_cpu(numa_cpus, 0);
3492                 if (cpu >= 0)
3493                         goto cpu_found;
3494         }
3495
3496         /*
3497          * Search for any idle CPU usable by the task.
3498          */
3499         cpu = scx_pick_idle_cpu(p->cpus_ptr, 0);
3500         if (cpu >= 0)
3501                 goto cpu_found;
3502
3503         rcu_read_unlock();
3504         return prev_cpu;
3505
3506 cpu_found:
3507         rcu_read_unlock();
3508
3509         *found = true;
3510         return cpu;
3511 }
3512
3513 static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags)
3514 {
3515         /*
3516          * sched_exec() calls with %WF_EXEC when @p is about to exec(2) as it
3517          * can be a good migration opportunity with low cache and memory
3518          * footprint. Returning a CPU different than @prev_cpu triggers
3519          * immediate rq migration. However, for SCX, as the current rq
3520          * association doesn't dictate where the task is going to run, this
3521          * doesn't fit well. If necessary, we can later add a dedicated method
3522          * which can decide to preempt self to force it through the regular
3523          * scheduling path.
3524          */
3525         if (unlikely(wake_flags & WF_EXEC))
3526                 return prev_cpu;
3527
3528         if (SCX_HAS_OP(select_cpu) && !scx_rq_bypassing(task_rq(p))) {
3529                 s32 cpu;
3530                 struct task_struct **ddsp_taskp;
3531
3532                 ddsp_taskp = this_cpu_ptr(&direct_dispatch_task);
3533                 WARN_ON_ONCE(*ddsp_taskp);
3534                 *ddsp_taskp = p;
3535
3536                 cpu = SCX_CALL_OP_TASK_RET(SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU,
3537                                            select_cpu, p, prev_cpu, wake_flags);
3538                 *ddsp_taskp = NULL;
3539                 if (ops_cpu_valid(cpu, "from ops.select_cpu()"))
3540                         return cpu;
3541                 else
3542                         return prev_cpu;
3543         } else {
3544                 bool found;
3545                 s32 cpu;
3546
3547                 cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, &found);
3548                 if (found) {
3549                         p->scx.slice = SCX_SLICE_DFL;
3550                         p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL;
3551                 }
3552                 return cpu;
3553         }
3554 }
3555
3556 static void task_woken_scx(struct rq *rq, struct task_struct *p)
3557 {
3558         run_deferred(rq);
3559 }
3560
3561 static void set_cpus_allowed_scx(struct task_struct *p,
3562                                  struct affinity_context *ac)
3563 {
3564         set_cpus_allowed_common(p, ac);
3565
3566         /*
3567          * The effective cpumask is stored in @p->cpus_ptr which may temporarily
3568          * differ from the configured one in @p->cpus_mask. Always tell the bpf
3569          * scheduler the effective one.
3570          *
3571          * Fine-grained memory write control is enforced by BPF making the const
3572          * designation pointless. Cast it away when calling the operation.
3573          */
3574         if (SCX_HAS_OP(set_cpumask))
3575                 SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, p,
3576                                  (struct cpumask *)p->cpus_ptr);
3577 }
3578
3579 static void reset_idle_masks(void)
3580 {
3581         /*
3582          * Consider all online cpus idle. Should converge to the actual state
3583          * quickly.
3584          */
3585         cpumask_copy(idle_masks.cpu, cpu_online_mask);
3586         cpumask_copy(idle_masks.smt, cpu_online_mask);
3587 }
3588
3589 void __scx_update_idle(struct rq *rq, bool idle)
3590 {
3591         int cpu = cpu_of(rq);
3592
3593         if (SCX_HAS_OP(update_idle) && !scx_rq_bypassing(rq)) {
3594                 SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle);
3595                 if (!static_branch_unlikely(&scx_builtin_idle_enabled))
3596                         return;
3597         }
3598
3599         if (idle)
3600                 cpumask_set_cpu(cpu, idle_masks.cpu);
3601         else
3602                 cpumask_clear_cpu(cpu, idle_masks.cpu);
3603
3604 #ifdef CONFIG_SCHED_SMT
3605         if (sched_smt_active()) {
3606                 const struct cpumask *smt = cpu_smt_mask(cpu);
3607
3608                 if (idle) {
3609                         /*
3610                          * idle_masks.smt handling is racy but that's fine as
3611                          * it's only for optimization and self-correcting.
3612                          */
3613                         for_each_cpu(cpu, smt) {
3614                                 if (!cpumask_test_cpu(cpu, idle_masks.cpu))
3615                                         return;
3616                         }
3617                         cpumask_or(idle_masks.smt, idle_masks.smt, smt);
3618                 } else {
3619                         cpumask_andnot(idle_masks.smt, idle_masks.smt, smt);
3620                 }
3621         }
3622 #endif
3623 }
3624
3625 static void handle_hotplug(struct rq *rq, bool online)
3626 {
3627         int cpu = cpu_of(rq);
3628
3629         atomic_long_inc(&scx_hotplug_seq);
3630
3631         if (scx_enabled())
3632                 update_selcpu_topology();
3633
3634         if (online && SCX_HAS_OP(cpu_online))
3635                 SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, cpu);
3636         else if (!online && SCX_HAS_OP(cpu_offline))
3637                 SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_offline, cpu);
3638         else
3639                 scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG,
3640                              "cpu %d going %s, exiting scheduler", cpu,
3641                              online ? "online" : "offline");
3642 }
3643
3644 void scx_rq_activate(struct rq *rq)
3645 {
3646         handle_hotplug(rq, true);
3647 }
3648
3649 void scx_rq_deactivate(struct rq *rq)
3650 {
3651         handle_hotplug(rq, false);
3652 }
3653
3654 static void rq_online_scx(struct rq *rq)
3655 {
3656         rq->scx.flags |= SCX_RQ_ONLINE;
3657 }
3658
3659 static void rq_offline_scx(struct rq *rq)
3660 {
3661         rq->scx.flags &= ~SCX_RQ_ONLINE;
3662 }
3663
3664 #else   /* CONFIG_SMP */
3665
3666 static bool test_and_clear_cpu_idle(int cpu) { return false; }
3667 static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) { return -EBUSY; }
3668 static void reset_idle_masks(void) {}
3669
3670 #endif  /* CONFIG_SMP */
3671
3672 static bool check_rq_for_timeouts(struct rq *rq)
3673 {
3674         struct task_struct *p;
3675         struct rq_flags rf;
3676         bool timed_out = false;
3677
3678         rq_lock_irqsave(rq, &rf);
3679         list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) {
3680                 unsigned long last_runnable = p->scx.runnable_at;
3681
3682                 if (unlikely(time_after(jiffies,
3683                                         last_runnable + scx_watchdog_timeout))) {
3684                         u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable);
3685
3686                         scx_ops_error_kind(SCX_EXIT_ERROR_STALL,
3687                                            "%s[%d] failed to run for %u.%03us",
3688                                            p->comm, p->pid,
3689                                            dur_ms / 1000, dur_ms % 1000);
3690                         timed_out = true;
3691                         break;
3692                 }
3693         }
3694         rq_unlock_irqrestore(rq, &rf);
3695
3696         return timed_out;
3697 }
3698
3699 static void scx_watchdog_workfn(struct work_struct *work)
3700 {
3701         int cpu;
3702
3703         WRITE_ONCE(scx_watchdog_timestamp, jiffies);
3704
3705         for_each_online_cpu(cpu) {
3706                 if (unlikely(check_rq_for_timeouts(cpu_rq(cpu))))
3707                         break;
3708
3709                 cond_resched();
3710         }
3711         queue_delayed_work(system_unbound_wq, to_delayed_work(work),
3712                            scx_watchdog_timeout / 2);
3713 }
3714
3715 void scx_tick(struct rq *rq)
3716 {
3717         unsigned long last_check;
3718
3719         if (!scx_enabled())
3720                 return;
3721
3722         last_check = READ_ONCE(scx_watchdog_timestamp);
3723         if (unlikely(time_after(jiffies,
3724                                 last_check + READ_ONCE(scx_watchdog_timeout)))) {
3725                 u32 dur_ms = jiffies_to_msecs(jiffies - last_check);
3726
3727                 scx_ops_error_kind(SCX_EXIT_ERROR_STALL,
3728                                    "watchdog failed to check in for %u.%03us",
3729                                    dur_ms / 1000, dur_ms % 1000);
3730         }
3731
3732         update_other_load_avgs(rq);
3733 }
3734
3735 static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued)
3736 {
3737         update_curr_scx(rq);
3738
3739         /*
3740          * While disabling, always resched and refresh core-sched timestamp as
3741          * we can't trust the slice management or ops.core_sched_before().
3742          */
3743         if (scx_rq_bypassing(rq)) {
3744                 curr->scx.slice = 0;
3745                 touch_core_sched(rq, curr);
3746         } else if (SCX_HAS_OP(tick)) {
3747                 SCX_CALL_OP(SCX_KF_REST, tick, curr);
3748         }
3749
3750         if (!curr->scx.slice)
3751                 resched_curr(rq);
3752 }
3753
3754 #ifdef CONFIG_EXT_GROUP_SCHED
3755 static struct cgroup *tg_cgrp(struct task_group *tg)
3756 {
3757         /*
3758          * If CGROUP_SCHED is disabled, @tg is NULL. If @tg is an autogroup,
3759          * @tg->css.cgroup is NULL. In both cases, @tg can be treated as the
3760          * root cgroup.
3761          */
3762         if (tg && tg->css.cgroup)
3763                 return tg->css.cgroup;
3764         else
3765                 return &cgrp_dfl_root.cgrp;
3766 }
3767
3768 #define SCX_INIT_TASK_ARGS_CGROUP(tg)           .cgroup = tg_cgrp(tg),
3769
3770 #else   /* CONFIG_EXT_GROUP_SCHED */
3771
3772 #define SCX_INIT_TASK_ARGS_CGROUP(tg)
3773
3774 #endif  /* CONFIG_EXT_GROUP_SCHED */
3775
3776 static enum scx_task_state scx_get_task_state(const struct task_struct *p)
3777 {
3778         return (p->scx.flags & SCX_TASK_STATE_MASK) >> SCX_TASK_STATE_SHIFT;
3779 }
3780
3781 static void scx_set_task_state(struct task_struct *p, enum scx_task_state state)
3782 {
3783         enum scx_task_state prev_state = scx_get_task_state(p);
3784         bool warn = false;
3785
3786         BUILD_BUG_ON(SCX_TASK_NR_STATES > (1 << SCX_TASK_STATE_BITS));
3787
3788         switch (state) {
3789         case SCX_TASK_NONE:
3790                 break;
3791         case SCX_TASK_INIT:
3792                 warn = prev_state != SCX_TASK_NONE;
3793                 break;
3794         case SCX_TASK_READY:
3795                 warn = prev_state == SCX_TASK_NONE;
3796                 break;
3797         case SCX_TASK_ENABLED:
3798                 warn = prev_state != SCX_TASK_READY;
3799                 break;
3800         default:
3801                 warn = true;
3802                 return;
3803         }
3804
3805         WARN_ONCE(warn, "sched_ext: Invalid task state transition %d -> %d for %s[%d]",
3806                   prev_state, state, p->comm, p->pid);
3807
3808         p->scx.flags &= ~SCX_TASK_STATE_MASK;
3809         p->scx.flags |= state << SCX_TASK_STATE_SHIFT;
3810 }
3811
3812 static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool fork)
3813 {
3814         int ret;
3815
3816         p->scx.disallow = false;
3817
3818         if (SCX_HAS_OP(init_task)) {
3819                 struct scx_init_task_args args = {
3820                         SCX_INIT_TASK_ARGS_CGROUP(tg)
3821                         .fork = fork,
3822                 };
3823
3824                 ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init_task, p, &args);
3825                 if (unlikely(ret)) {
3826                         ret = ops_sanitize_err("init_task", ret);
3827                         return ret;
3828                 }
3829         }
3830
3831         scx_set_task_state(p, SCX_TASK_INIT);
3832
3833         if (p->scx.disallow) {
3834                 if (!fork) {
3835                         struct rq *rq;
3836                         struct rq_flags rf;
3837
3838                         rq = task_rq_lock(p, &rf);
3839
3840                         /*
3841                          * We're in the load path and @p->policy will be applied
3842                          * right after. Reverting @p->policy here and rejecting
3843                          * %SCHED_EXT transitions from scx_check_setscheduler()
3844                          * guarantees that if ops.init_task() sets @p->disallow,
3845                          * @p can never be in SCX.
3846                          */
3847                         if (p->policy == SCHED_EXT) {
3848                                 p->policy = SCHED_NORMAL;
3849                                 atomic_long_inc(&scx_nr_rejected);
3850                         }
3851
3852                         task_rq_unlock(rq, p, &rf);
3853                 } else if (p->policy == SCHED_EXT) {
3854                         scx_ops_error("ops.init_task() set task->scx.disallow for %s[%d] during fork",
3855                                       p->comm, p->pid);
3856                 }
3857         }
3858
3859         p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT;
3860         return 0;
3861 }
3862
3863 static void scx_ops_enable_task(struct task_struct *p)
3864 {
3865         u32 weight;
3866
3867         lockdep_assert_rq_held(task_rq(p));
3868
3869         /*
3870          * Set the weight before calling ops.enable() so that the scheduler
3871          * doesn't see a stale value if they inspect the task struct.
3872          */
3873         if (task_has_idle_policy(p))
3874                 weight = WEIGHT_IDLEPRIO;
3875         else
3876                 weight = sched_prio_to_weight[p->static_prio - MAX_RT_PRIO];
3877
3878         p->scx.weight = sched_weight_to_cgroup(weight);
3879
3880         if (SCX_HAS_OP(enable))
3881                 SCX_CALL_OP_TASK(SCX_KF_REST, enable, p);
3882         scx_set_task_state(p, SCX_TASK_ENABLED);
3883
3884         if (SCX_HAS_OP(set_weight))
3885                 SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight);
3886 }
3887
3888 static void scx_ops_disable_task(struct task_struct *p)
3889 {
3890         lockdep_assert_rq_held(task_rq(p));
3891         WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED);
3892
3893         if (SCX_HAS_OP(disable))
3894                 SCX_CALL_OP(SCX_KF_REST, disable, p);
3895         scx_set_task_state(p, SCX_TASK_READY);
3896 }
3897
3898 static void scx_ops_exit_task(struct task_struct *p)
3899 {
3900         struct scx_exit_task_args args = {
3901                 .cancelled = false,
3902         };
3903
3904         lockdep_assert_rq_held(task_rq(p));
3905
3906         switch (scx_get_task_state(p)) {
3907         case SCX_TASK_NONE:
3908                 return;
3909         case SCX_TASK_INIT:
3910                 args.cancelled = true;
3911                 break;
3912         case SCX_TASK_READY:
3913                 break;
3914         case SCX_TASK_ENABLED:
3915                 scx_ops_disable_task(p);
3916                 break;
3917         default:
3918                 WARN_ON_ONCE(true);
3919                 return;
3920         }
3921
3922         if (SCX_HAS_OP(exit_task))
3923                 SCX_CALL_OP(SCX_KF_REST, exit_task, p, &args);
3924         scx_set_task_state(p, SCX_TASK_NONE);
3925 }
3926
3927 void init_scx_entity(struct sched_ext_entity *scx)
3928 {
3929         memset(scx, 0, sizeof(*scx));
3930         INIT_LIST_HEAD(&scx->dsq_list.node);
3931         RB_CLEAR_NODE(&scx->dsq_priq);
3932         scx->sticky_cpu = -1;
3933         scx->holding_cpu = -1;
3934         INIT_LIST_HEAD(&scx->runnable_node);
3935         scx->runnable_at = jiffies;
3936         scx->ddsp_dsq_id = SCX_DSQ_INVALID;
3937         scx->slice = SCX_SLICE_DFL;
3938 }
3939
3940 void scx_pre_fork(struct task_struct *p)
3941 {
3942         /*
3943          * BPF scheduler enable/disable paths want to be able to iterate and
3944          * update all tasks which can become complex when racing forks. As
3945          * enable/disable are very cold paths, let's use a percpu_rwsem to
3946          * exclude forks.
3947          */
3948         percpu_down_read(&scx_fork_rwsem);
3949 }
3950
3951 int scx_fork(struct task_struct *p)
3952 {
3953         percpu_rwsem_assert_held(&scx_fork_rwsem);
3954
3955         if (scx_ops_init_task_enabled)
3956                 return scx_ops_init_task(p, task_group(p), true);
3957         else
3958                 return 0;
3959 }
3960
3961 void scx_post_fork(struct task_struct *p)
3962 {
3963         if (scx_ops_init_task_enabled) {
3964                 scx_set_task_state(p, SCX_TASK_READY);
3965
3966                 /*
3967                  * Enable the task immediately if it's running on sched_ext.
3968                  * Otherwise, it'll be enabled in switching_to_scx() if and
3969                  * when it's ever configured to run with a SCHED_EXT policy.
3970                  */
3971                 if (p->sched_class == &ext_sched_class) {
3972                         struct rq_flags rf;
3973                         struct rq *rq;
3974
3975                         rq = task_rq_lock(p, &rf);
3976                         scx_ops_enable_task(p);
3977                         task_rq_unlock(rq, p, &rf);
3978                 }
3979         }
3980
3981         spin_lock_irq(&scx_tasks_lock);
3982         list_add_tail(&p->scx.tasks_node, &scx_tasks);
3983         spin_unlock_irq(&scx_tasks_lock);
3984
3985         percpu_up_read(&scx_fork_rwsem);
3986 }
3987
3988 void scx_cancel_fork(struct task_struct *p)
3989 {
3990         if (scx_enabled()) {
3991                 struct rq *rq;
3992                 struct rq_flags rf;
3993
3994                 rq = task_rq_lock(p, &rf);
3995                 WARN_ON_ONCE(scx_get_task_state(p) >= SCX_TASK_READY);
3996                 scx_ops_exit_task(p);
3997                 task_rq_unlock(rq, p, &rf);
3998         }
3999
4000         percpu_up_read(&scx_fork_rwsem);
4001 }
4002
4003 void sched_ext_free(struct task_struct *p)
4004 {
4005         unsigned long flags;
4006
4007         spin_lock_irqsave(&scx_tasks_lock, flags);
4008         list_del_init(&p->scx.tasks_node);
4009         spin_unlock_irqrestore(&scx_tasks_lock, flags);
4010
4011         /*
4012          * @p is off scx_tasks and wholly ours. scx_ops_enable()'s READY ->
4013          * ENABLED transitions can't race us. Disable ops for @p.
4014          */
4015         if (scx_get_task_state(p) != SCX_TASK_NONE) {
4016                 struct rq_flags rf;
4017                 struct rq *rq;
4018
4019                 rq = task_rq_lock(p, &rf);
4020                 scx_ops_exit_task(p);
4021                 task_rq_unlock(rq, p, &rf);
4022         }
4023 }
4024
4025 static void reweight_task_scx(struct rq *rq, struct task_struct *p,
4026                               const struct load_weight *lw)
4027 {
4028         lockdep_assert_rq_held(task_rq(p));
4029
4030         p->scx.weight = sched_weight_to_cgroup(scale_load_down(lw->weight));
4031         if (SCX_HAS_OP(set_weight))
4032                 SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight);
4033 }
4034
4035 static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio)
4036 {
4037 }
4038
4039 static void switching_to_scx(struct rq *rq, struct task_struct *p)
4040 {
4041         scx_ops_enable_task(p);
4042
4043         /*
4044          * set_cpus_allowed_scx() is not called while @p is associated with a
4045          * different scheduler class. Keep the BPF scheduler up-to-date.
4046          */
4047         if (SCX_HAS_OP(set_cpumask))
4048                 SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, p,
4049                                  (struct cpumask *)p->cpus_ptr);
4050 }
4051
4052 static void switched_from_scx(struct rq *rq, struct task_struct *p)
4053 {
4054         scx_ops_disable_task(p);
4055 }
4056
4057 static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p,int wake_flags) {}
4058 static void switched_to_scx(struct rq *rq, struct task_struct *p) {}
4059
4060 int scx_check_setscheduler(struct task_struct *p, int policy)
4061 {
4062         lockdep_assert_rq_held(task_rq(p));
4063
4064         /* if disallow, reject transitioning into SCX */
4065         if (scx_enabled() && READ_ONCE(p->scx.disallow) &&
4066             p->policy != policy && policy == SCHED_EXT)
4067                 return -EACCES;
4068
4069         return 0;
4070 }
4071
4072 #ifdef CONFIG_NO_HZ_FULL
4073 bool scx_can_stop_tick(struct rq *rq)
4074 {
4075         struct task_struct *p = rq->curr;
4076
4077         if (scx_rq_bypassing(rq))
4078                 return false;
4079
4080         if (p->sched_class != &ext_sched_class)
4081                 return true;
4082
4083         /*
4084          * @rq can dispatch from different DSQs, so we can't tell whether it
4085          * needs the tick or not by looking at nr_running. Allow stopping ticks
4086          * iff the BPF scheduler indicated so. See set_next_task_scx().
4087          */
4088         return rq->scx.flags & SCX_RQ_CAN_STOP_TICK;
4089 }
4090 #endif
4091
4092 #ifdef CONFIG_EXT_GROUP_SCHED
4093
4094 DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_rwsem);
4095 static bool scx_cgroup_enabled;
4096 static bool cgroup_warned_missing_weight;
4097 static bool cgroup_warned_missing_idle;
4098
4099 static void scx_cgroup_warn_missing_weight(struct task_group *tg)
4100 {
4101         if (scx_ops_enable_state() == SCX_OPS_DISABLED ||
4102             cgroup_warned_missing_weight)
4103                 return;
4104
4105         if ((scx_ops.flags & SCX_OPS_HAS_CGROUP_WEIGHT) || !tg->css.parent)
4106                 return;
4107
4108         pr_warn("sched_ext: \"%s\" does not implement cgroup cpu.weight\n",
4109                 scx_ops.name);
4110         cgroup_warned_missing_weight = true;
4111 }
4112
4113 static void scx_cgroup_warn_missing_idle(struct task_group *tg)
4114 {
4115         if (!scx_cgroup_enabled || cgroup_warned_missing_idle)
4116                 return;
4117
4118         if (!tg->idle)
4119                 return;
4120
4121         pr_warn("sched_ext: \"%s\" does not implement cgroup cpu.idle\n",
4122                 scx_ops.name);
4123         cgroup_warned_missing_idle = true;
4124 }
4125
4126 int scx_tg_online(struct task_group *tg)
4127 {
4128         int ret = 0;
4129
4130         WARN_ON_ONCE(tg->scx_flags & (SCX_TG_ONLINE | SCX_TG_INITED));
4131
4132         percpu_down_read(&scx_cgroup_rwsem);
4133
4134         scx_cgroup_warn_missing_weight(tg);
4135
4136         if (scx_cgroup_enabled) {
4137                 if (SCX_HAS_OP(cgroup_init)) {
4138                         struct scx_cgroup_init_args args =
4139                                 { .weight = tg->scx_weight };
4140
4141                         ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init,
4142                                               tg->css.cgroup, &args);
4143                         if (ret)
4144                                 ret = ops_sanitize_err("cgroup_init", ret);
4145                 }
4146                 if (ret == 0)
4147                         tg->scx_flags |= SCX_TG_ONLINE | SCX_TG_INITED;
4148         } else {
4149                 tg->scx_flags |= SCX_TG_ONLINE;
4150         }
4151
4152         percpu_up_read(&scx_cgroup_rwsem);
4153         return ret;
4154 }
4155
4156 void scx_tg_offline(struct task_group *tg)
4157 {
4158         WARN_ON_ONCE(!(tg->scx_flags & SCX_TG_ONLINE));
4159
4160         percpu_down_read(&scx_cgroup_rwsem);
4161
4162         if (SCX_HAS_OP(cgroup_exit) && (tg->scx_flags & SCX_TG_INITED))
4163                 SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, tg->css.cgroup);
4164         tg->scx_flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED);
4165
4166         percpu_up_read(&scx_cgroup_rwsem);
4167 }
4168
4169 int scx_cgroup_can_attach(struct cgroup_taskset *tset)
4170 {
4171         struct cgroup_subsys_state *css;
4172         struct task_struct *p;
4173         int ret;
4174
4175         /* released in scx_finish/cancel_attach() */
4176         percpu_down_read(&scx_cgroup_rwsem);
4177
4178         if (!scx_cgroup_enabled)
4179                 return 0;
4180
4181         cgroup_taskset_for_each(p, css, tset) {
4182                 struct cgroup *from = tg_cgrp(task_group(p));
4183                 struct cgroup *to = tg_cgrp(css_tg(css));
4184
4185                 WARN_ON_ONCE(p->scx.cgrp_moving_from);
4186
4187                 /*
4188                  * sched_move_task() omits identity migrations. Let's match the
4189                  * behavior so that ops.cgroup_prep_move() and ops.cgroup_move()
4190                  * always match one-to-one.
4191                  */
4192                 if (from == to)
4193                         continue;
4194
4195                 if (SCX_HAS_OP(cgroup_prep_move)) {
4196                         ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_prep_move,
4197                                               p, from, css->cgroup);
4198                         if (ret)
4199                                 goto err;
4200                 }
4201
4202                 p->scx.cgrp_moving_from = from;
4203         }
4204
4205         return 0;
4206
4207 err:
4208         cgroup_taskset_for_each(p, css, tset) {
4209                 if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from)
4210                         SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, p,
4211                                     p->scx.cgrp_moving_from, css->cgroup);
4212                 p->scx.cgrp_moving_from = NULL;
4213         }
4214
4215         percpu_up_read(&scx_cgroup_rwsem);
4216         return ops_sanitize_err("cgroup_prep_move", ret);
4217 }
4218
4219 void scx_move_task(struct task_struct *p)
4220 {
4221         if (!scx_cgroup_enabled)
4222                 return;
4223
4224         /*
4225          * We're called from sched_move_task() which handles both cgroup and
4226          * autogroup moves. Ignore the latter.
4227          *
4228          * Also ignore exiting tasks, because in the exit path tasks transition
4229          * from the autogroup to the root group, so task_group_is_autogroup()
4230          * alone isn't able to catch exiting autogroup tasks. This is safe for
4231          * cgroup_move(), because cgroup migrations never happen for PF_EXITING
4232          * tasks.
4233          */
4234         if (task_group_is_autogroup(task_group(p)) || (p->flags & PF_EXITING))
4235                 return;
4236
4237         /*
4238          * @p must have ops.cgroup_prep_move() called on it and thus
4239          * cgrp_moving_from set.
4240          */
4241         if (SCX_HAS_OP(cgroup_move) && !WARN_ON_ONCE(!p->scx.cgrp_moving_from))
4242                 SCX_CALL_OP_TASK(SCX_KF_UNLOCKED, cgroup_move, p,
4243                         p->scx.cgrp_moving_from, tg_cgrp(task_group(p)));
4244         p->scx.cgrp_moving_from = NULL;
4245 }
4246
4247 void scx_cgroup_finish_attach(void)
4248 {
4249         percpu_up_read(&scx_cgroup_rwsem);
4250 }
4251
4252 void scx_cgroup_cancel_attach(struct cgroup_taskset *tset)
4253 {
4254         struct cgroup_subsys_state *css;
4255         struct task_struct *p;
4256
4257         if (!scx_cgroup_enabled)
4258                 goto out_unlock;
4259
4260         cgroup_taskset_for_each(p, css, tset) {
4261                 if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from)
4262                         SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, p,
4263                                     p->scx.cgrp_moving_from, css->cgroup);
4264                 p->scx.cgrp_moving_from = NULL;
4265         }
4266 out_unlock:
4267         percpu_up_read(&scx_cgroup_rwsem);
4268 }
4269
4270 void scx_group_set_weight(struct task_group *tg, unsigned long weight)
4271 {
4272         percpu_down_read(&scx_cgroup_rwsem);
4273
4274         if (scx_cgroup_enabled && tg->scx_weight != weight) {
4275                 if (SCX_HAS_OP(cgroup_set_weight))
4276                         SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_set_weight,
4277                                     tg_cgrp(tg), weight);
4278                 tg->scx_weight = weight;
4279         }
4280
4281         percpu_up_read(&scx_cgroup_rwsem);
4282 }
4283
4284 void scx_group_set_idle(struct task_group *tg, bool idle)
4285 {
4286         percpu_down_read(&scx_cgroup_rwsem);
4287         scx_cgroup_warn_missing_idle(tg);
4288         percpu_up_read(&scx_cgroup_rwsem);
4289 }
4290
4291 static void scx_cgroup_lock(void)
4292 {
4293         percpu_down_write(&scx_cgroup_rwsem);
4294 }
4295
4296 static void scx_cgroup_unlock(void)
4297 {
4298         percpu_up_write(&scx_cgroup_rwsem);
4299 }
4300
4301 #else   /* CONFIG_EXT_GROUP_SCHED */
4302
4303 static inline void scx_cgroup_lock(void) {}
4304 static inline void scx_cgroup_unlock(void) {}
4305
4306 #endif  /* CONFIG_EXT_GROUP_SCHED */
4307
4308 /*
4309  * Omitted operations:
4310  *
4311  * - wakeup_preempt: NOOP as it isn't useful in the wakeup path because the task
4312  *   isn't tied to the CPU at that point. Preemption is implemented by resetting
4313  *   the victim task's slice to 0 and triggering reschedule on the target CPU.
4314  *
4315  * - migrate_task_rq: Unnecessary as task to cpu mapping is transient.
4316  *
4317  * - task_fork/dead: We need fork/dead notifications for all tasks regardless of
4318  *   their current sched_class. Call them directly from sched core instead.
4319  */
4320 DEFINE_SCHED_CLASS(ext) = {
4321         .enqueue_task           = enqueue_task_scx,
4322         .dequeue_task           = dequeue_task_scx,
4323         .yield_task             = yield_task_scx,
4324         .yield_to_task          = yield_to_task_scx,
4325
4326         .wakeup_preempt         = wakeup_preempt_scx,
4327
4328         .balance                = balance_scx,
4329         .pick_task              = pick_task_scx,
4330
4331         .put_prev_task          = put_prev_task_scx,
4332         .set_next_task          = set_next_task_scx,
4333
4334 #ifdef CONFIG_SMP
4335         .select_task_rq         = select_task_rq_scx,
4336         .task_woken             = task_woken_scx,
4337         .set_cpus_allowed       = set_cpus_allowed_scx,
4338
4339         .rq_online              = rq_online_scx,
4340         .rq_offline             = rq_offline_scx,
4341 #endif
4342
4343         .task_tick              = task_tick_scx,
4344
4345         .switching_to           = switching_to_scx,
4346         .switched_from          = switched_from_scx,
4347         .switched_to            = switched_to_scx,
4348         .reweight_task          = reweight_task_scx,
4349         .prio_changed           = prio_changed_scx,
4350
4351         .update_curr            = update_curr_scx,
4352
4353 #ifdef CONFIG_UCLAMP_TASK
4354         .uclamp_enabled         = 1,
4355 #endif
4356 };
4357
4358 static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id)
4359 {
4360         memset(dsq, 0, sizeof(*dsq));
4361
4362         raw_spin_lock_init(&dsq->lock);
4363         INIT_LIST_HEAD(&dsq->list);
4364         dsq->id = dsq_id;
4365 }
4366
4367 static struct scx_dispatch_q *create_dsq(u64 dsq_id, int node)
4368 {
4369         struct scx_dispatch_q *dsq;
4370         int ret;
4371
4372         if (dsq_id & SCX_DSQ_FLAG_BUILTIN)
4373                 return ERR_PTR(-EINVAL);
4374
4375         dsq = kmalloc_node(sizeof(*dsq), GFP_KERNEL, node);
4376         if (!dsq)
4377                 return ERR_PTR(-ENOMEM);
4378
4379         init_dsq(dsq, dsq_id);
4380
4381         ret = rhashtable_insert_fast(&dsq_hash, &dsq->hash_node,
4382                                      dsq_hash_params);
4383         if (ret) {
4384                 kfree(dsq);
4385                 return ERR_PTR(ret);
4386         }
4387         return dsq;
4388 }
4389
4390 static void free_dsq_irq_workfn(struct irq_work *irq_work)
4391 {
4392         struct llist_node *to_free = llist_del_all(&dsqs_to_free);
4393         struct scx_dispatch_q *dsq, *tmp_dsq;
4394
4395         llist_for_each_entry_safe(dsq, tmp_dsq, to_free, free_node)
4396                 kfree_rcu(dsq, rcu);
4397 }
4398
4399 static DEFINE_IRQ_WORK(free_dsq_irq_work, free_dsq_irq_workfn);
4400
4401 static void destroy_dsq(u64 dsq_id)
4402 {
4403         struct scx_dispatch_q *dsq;
4404         unsigned long flags;
4405
4406         rcu_read_lock();
4407
4408         dsq = find_user_dsq(dsq_id);
4409         if (!dsq)
4410                 goto out_unlock_rcu;
4411
4412         raw_spin_lock_irqsave(&dsq->lock, flags);
4413
4414         if (dsq->nr) {
4415                 scx_ops_error("attempting to destroy in-use dsq 0x%016llx (nr=%u)",
4416                               dsq->id, dsq->nr);
4417                 goto out_unlock_dsq;
4418         }
4419
4420         if (rhashtable_remove_fast(&dsq_hash, &dsq->hash_node, dsq_hash_params))
4421                 goto out_unlock_dsq;
4422
4423         /*
4424          * Mark dead by invalidating ->id to prevent dispatch_enqueue() from
4425          * queueing more tasks. As this function can be called from anywhere,
4426          * freeing is bounced through an irq work to avoid nesting RCU
4427          * operations inside scheduler locks.
4428          */
4429         dsq->id = SCX_DSQ_INVALID;
4430         llist_add(&dsq->free_node, &dsqs_to_free);
4431         irq_work_queue(&free_dsq_irq_work);
4432
4433 out_unlock_dsq:
4434         raw_spin_unlock_irqrestore(&dsq->lock, flags);
4435 out_unlock_rcu:
4436         rcu_read_unlock();
4437 }
4438
4439 #ifdef CONFIG_EXT_GROUP_SCHED
4440 static void scx_cgroup_exit(void)
4441 {
4442         struct cgroup_subsys_state *css;
4443
4444         percpu_rwsem_assert_held(&scx_cgroup_rwsem);
4445
4446         scx_cgroup_enabled = false;
4447
4448         /*
4449          * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk
4450          * cgroups and exit all the inited ones, all online cgroups are exited.
4451          */
4452         rcu_read_lock();
4453         css_for_each_descendant_post(css, &root_task_group.css) {
4454                 struct task_group *tg = css_tg(css);
4455
4456                 if (!(tg->scx_flags & SCX_TG_INITED))
4457                         continue;
4458                 tg->scx_flags &= ~SCX_TG_INITED;
4459
4460                 if (!scx_ops.cgroup_exit)
4461                         continue;
4462
4463                 if (WARN_ON_ONCE(!css_tryget(css)))
4464                         continue;
4465                 rcu_read_unlock();
4466
4467                 SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, css->cgroup);
4468
4469                 rcu_read_lock();
4470                 css_put(css);
4471         }
4472         rcu_read_unlock();
4473 }
4474
4475 static int scx_cgroup_init(void)
4476 {
4477         struct cgroup_subsys_state *css;
4478         int ret;
4479
4480         percpu_rwsem_assert_held(&scx_cgroup_rwsem);
4481
4482         cgroup_warned_missing_weight = false;
4483         cgroup_warned_missing_idle = false;
4484
4485         /*
4486          * scx_tg_on/offline() are excluded thorugh scx_cgroup_rwsem. If we walk
4487          * cgroups and init, all online cgroups are initialized.
4488          */
4489         rcu_read_lock();
4490         css_for_each_descendant_pre(css, &root_task_group.css) {
4491                 struct task_group *tg = css_tg(css);
4492                 struct scx_cgroup_init_args args = { .weight = tg->scx_weight };
4493
4494                 scx_cgroup_warn_missing_weight(tg);
4495                 scx_cgroup_warn_missing_idle(tg);
4496
4497                 if ((tg->scx_flags &
4498                      (SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE)
4499                         continue;
4500
4501                 if (!scx_ops.cgroup_init) {
4502                         tg->scx_flags |= SCX_TG_INITED;
4503                         continue;
4504                 }
4505
4506                 if (WARN_ON_ONCE(!css_tryget(css)))
4507                         continue;
4508                 rcu_read_unlock();
4509
4510                 ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init,
4511                                       css->cgroup, &args);
4512                 if (ret) {
4513                         css_put(css);
4514                         scx_ops_error("ops.cgroup_init() failed (%d)", ret);
4515                         return ret;
4516                 }
4517                 tg->scx_flags |= SCX_TG_INITED;
4518
4519                 rcu_read_lock();
4520                 css_put(css);
4521         }
4522         rcu_read_unlock();
4523
4524         WARN_ON_ONCE(scx_cgroup_enabled);
4525         scx_cgroup_enabled = true;
4526
4527         return 0;
4528 }
4529
4530 #else
4531 static void scx_cgroup_exit(void) {}
4532 static int scx_cgroup_init(void) { return 0; }
4533 #endif
4534
4535
4536 /********************************************************************************
4537  * Sysfs interface and ops enable/disable.
4538  */
4539
4540 #define SCX_ATTR(_name)                                                         \
4541         static struct kobj_attribute scx_attr_##_name = {                       \
4542                 .attr = { .name = __stringify(_name), .mode = 0444 },           \
4543                 .show = scx_attr_##_name##_show,                                \
4544         }
4545
4546 static ssize_t scx_attr_state_show(struct kobject *kobj,
4547                                    struct kobj_attribute *ka, char *buf)
4548 {
4549         return sysfs_emit(buf, "%s\n",
4550                           scx_ops_enable_state_str[scx_ops_enable_state()]);
4551 }
4552 SCX_ATTR(state);
4553
4554 static ssize_t scx_attr_switch_all_show(struct kobject *kobj,
4555                                         struct kobj_attribute *ka, char *buf)
4556 {
4557         return sysfs_emit(buf, "%d\n", READ_ONCE(scx_switching_all));
4558 }
4559 SCX_ATTR(switch_all);
4560
4561 static ssize_t scx_attr_nr_rejected_show(struct kobject *kobj,
4562                                          struct kobj_attribute *ka, char *buf)
4563 {
4564         return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_nr_rejected));
4565 }
4566 SCX_ATTR(nr_rejected);
4567
4568 static ssize_t scx_attr_hotplug_seq_show(struct kobject *kobj,
4569                                          struct kobj_attribute *ka, char *buf)
4570 {
4571         return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_hotplug_seq));
4572 }
4573 SCX_ATTR(hotplug_seq);
4574
4575 static ssize_t scx_attr_enable_seq_show(struct kobject *kobj,
4576                                         struct kobj_attribute *ka, char *buf)
4577 {
4578         return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_enable_seq));
4579 }
4580 SCX_ATTR(enable_seq);
4581
4582 static struct attribute *scx_global_attrs[] = {
4583         &scx_attr_state.attr,
4584         &scx_attr_switch_all.attr,
4585         &scx_attr_nr_rejected.attr,
4586         &scx_attr_hotplug_seq.attr,
4587         &scx_attr_enable_seq.attr,
4588         NULL,
4589 };
4590
4591 static const struct attribute_group scx_global_attr_group = {
4592         .attrs = scx_global_attrs,
4593 };
4594
4595 static void scx_kobj_release(struct kobject *kobj)
4596 {
4597         kfree(kobj);
4598 }
4599
4600 static ssize_t scx_attr_ops_show(struct kobject *kobj,
4601                                  struct kobj_attribute *ka, char *buf)
4602 {
4603         return sysfs_emit(buf, "%s\n", scx_ops.name);
4604 }
4605 SCX_ATTR(ops);
4606
4607 static struct attribute *scx_sched_attrs[] = {
4608         &scx_attr_ops.attr,
4609         NULL,
4610 };
4611 ATTRIBUTE_GROUPS(scx_sched);
4612
4613 static const struct kobj_type scx_ktype = {
4614         .release = scx_kobj_release,
4615         .sysfs_ops = &kobj_sysfs_ops,
4616         .default_groups = scx_sched_groups,
4617 };
4618
4619 static int scx_uevent(const struct kobject *kobj, struct kobj_uevent_env *env)
4620 {
4621         return add_uevent_var(env, "SCXOPS=%s", scx_ops.name);
4622 }
4623
4624 static const struct kset_uevent_ops scx_uevent_ops = {
4625         .uevent = scx_uevent,
4626 };
4627
4628 /*
4629  * Used by sched_fork() and __setscheduler_prio() to pick the matching
4630  * sched_class. dl/rt are already handled.
4631  */
4632 bool task_should_scx(int policy)
4633 {
4634         if (!scx_enabled() ||
4635             unlikely(scx_ops_enable_state() == SCX_OPS_DISABLING))
4636                 return false;
4637         if (READ_ONCE(scx_switching_all))
4638                 return true;
4639         return policy == SCHED_EXT;
4640 }
4641
4642 /**
4643  * scx_softlockup - sched_ext softlockup handler
4644  *
4645  * On some multi-socket setups (e.g. 2x Intel 8480c), the BPF scheduler can
4646  * live-lock the system by making many CPUs target the same DSQ to the point
4647  * where soft-lockup detection triggers. This function is called from
4648  * soft-lockup watchdog when the triggering point is close and tries to unjam
4649  * the system by enabling the breather and aborting the BPF scheduler.
4650  */
4651 void scx_softlockup(u32 dur_s)
4652 {
4653         switch (scx_ops_enable_state()) {
4654         case SCX_OPS_ENABLING:
4655         case SCX_OPS_ENABLED:
4656                 break;
4657         default:
4658                 return;
4659         }
4660
4661         /* allow only one instance, cleared at the end of scx_ops_bypass() */
4662         if (test_and_set_bit(0, &scx_in_softlockup))
4663                 return;
4664
4665         printk_deferred(KERN_ERR "sched_ext: Soft lockup - CPU%d stuck for %us, disabling \"%s\"\n",
4666                         smp_processor_id(), dur_s, scx_ops.name);
4667
4668         /*
4669          * Some CPUs may be trapped in the dispatch paths. Enable breather
4670          * immediately; otherwise, we might even be able to get to
4671          * scx_ops_bypass().
4672          */
4673         atomic_inc(&scx_ops_breather_depth);
4674
4675         scx_ops_error("soft lockup - CPU#%d stuck for %us",
4676                       smp_processor_id(), dur_s);
4677 }
4678
4679 static void scx_clear_softlockup(void)
4680 {
4681         if (test_and_clear_bit(0, &scx_in_softlockup))
4682                 atomic_dec(&scx_ops_breather_depth);
4683 }
4684
4685 /**
4686  * scx_ops_bypass - [Un]bypass scx_ops and guarantee forward progress
4687  *
4688  * Bypassing guarantees that all runnable tasks make forward progress without
4689  * trusting the BPF scheduler. We can't grab any mutexes or rwsems as they might
4690  * be held by tasks that the BPF scheduler is forgetting to run, which
4691  * unfortunately also excludes toggling the static branches.
4692  *
4693  * Let's work around by overriding a couple ops and modifying behaviors based on
4694  * the DISABLING state and then cycling the queued tasks through dequeue/enqueue
4695  * to force global FIFO scheduling.
4696  *
4697  * - ops.select_cpu() is ignored and the default select_cpu() is used.
4698  *
4699  * - ops.enqueue() is ignored and tasks are queued in simple global FIFO order.
4700  *   %SCX_OPS_ENQ_LAST is also ignored.
4701  *
4702  * - ops.dispatch() is ignored.
4703  *
4704  * - balance_scx() does not set %SCX_RQ_BAL_KEEP on non-zero slice as slice
4705  *   can't be trusted. Whenever a tick triggers, the running task is rotated to
4706  *   the tail of the queue with core_sched_at touched.
4707  *
4708  * - pick_next_task() suppresses zero slice warning.
4709  *
4710  * - scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM
4711  *   operations.
4712  *
4713  * - scx_prio_less() reverts to the default core_sched_at order.
4714  */
4715 static void scx_ops_bypass(bool bypass)
4716 {
4717         static DEFINE_RAW_SPINLOCK(bypass_lock);
4718         int cpu;
4719         unsigned long flags;
4720
4721         raw_spin_lock_irqsave(&bypass_lock, flags);
4722         if (bypass) {
4723                 scx_ops_bypass_depth++;
4724                 WARN_ON_ONCE(scx_ops_bypass_depth <= 0);
4725                 if (scx_ops_bypass_depth != 1)
4726                         goto unlock;
4727         } else {
4728                 scx_ops_bypass_depth--;
4729                 WARN_ON_ONCE(scx_ops_bypass_depth < 0);
4730                 if (scx_ops_bypass_depth != 0)
4731                         goto unlock;
4732         }
4733
4734         atomic_inc(&scx_ops_breather_depth);
4735
4736         /*
4737          * No task property is changing. We just need to make sure all currently
4738          * queued tasks are re-queued according to the new scx_rq_bypassing()
4739          * state. As an optimization, walk each rq's runnable_list instead of
4740          * the scx_tasks list.
4741          *
4742          * This function can't trust the scheduler and thus can't use
4743          * cpus_read_lock(). Walk all possible CPUs instead of online.
4744          */
4745         for_each_possible_cpu(cpu) {
4746                 struct rq *rq = cpu_rq(cpu);
4747                 struct rq_flags rf;
4748                 struct task_struct *p, *n;
4749
4750                 rq_lock(rq, &rf);
4751
4752                 if (bypass) {
4753                         WARN_ON_ONCE(rq->scx.flags & SCX_RQ_BYPASSING);
4754                         rq->scx.flags |= SCX_RQ_BYPASSING;
4755                 } else {
4756                         WARN_ON_ONCE(!(rq->scx.flags & SCX_RQ_BYPASSING));
4757                         rq->scx.flags &= ~SCX_RQ_BYPASSING;
4758                 }
4759
4760                 /*
4761                  * We need to guarantee that no tasks are on the BPF scheduler
4762                  * while bypassing. Either we see enabled or the enable path
4763                  * sees scx_rq_bypassing() before moving tasks to SCX.
4764                  */
4765                 if (!scx_enabled()) {
4766                         rq_unlock_irqrestore(rq, &rf);
4767                         continue;
4768                 }
4769
4770                 /*
4771                  * The use of list_for_each_entry_safe_reverse() is required
4772                  * because each task is going to be removed from and added back
4773                  * to the runnable_list during iteration. Because they're added
4774                  * to the tail of the list, safe reverse iteration can still
4775                  * visit all nodes.
4776                  */
4777                 list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list,
4778                                                  scx.runnable_node) {
4779                         struct sched_enq_and_set_ctx ctx;
4780
4781                         /* cycling deq/enq is enough, see the function comment */
4782                         sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
4783                         sched_enq_and_set_task(&ctx);
4784                 }
4785
4786                 rq_unlock(rq, &rf);
4787
4788                 /* resched to restore ticks and idle state */
4789                 resched_cpu(cpu);
4790         }
4791
4792         atomic_dec(&scx_ops_breather_depth);
4793 unlock:
4794         raw_spin_unlock_irqrestore(&bypass_lock, flags);
4795         scx_clear_softlockup();
4796 }
4797
4798 static void free_exit_info(struct scx_exit_info *ei)
4799 {
4800         kfree(ei->dump);
4801         kfree(ei->msg);
4802         kfree(ei->bt);
4803         kfree(ei);
4804 }
4805
4806 static struct scx_exit_info *alloc_exit_info(size_t exit_dump_len)
4807 {
4808         struct scx_exit_info *ei;
4809
4810         ei = kzalloc(sizeof(*ei), GFP_KERNEL);
4811         if (!ei)
4812                 return NULL;
4813
4814         ei->bt = kcalloc(SCX_EXIT_BT_LEN, sizeof(ei->bt[0]), GFP_KERNEL);
4815         ei->msg = kzalloc(SCX_EXIT_MSG_LEN, GFP_KERNEL);
4816         ei->dump = kzalloc(exit_dump_len, GFP_KERNEL);
4817
4818         if (!ei->bt || !ei->msg || !ei->dump) {
4819                 free_exit_info(ei);
4820                 return NULL;
4821         }
4822
4823         return ei;
4824 }
4825
4826 static const char *scx_exit_reason(enum scx_exit_kind kind)
4827 {
4828         switch (kind) {
4829         case SCX_EXIT_UNREG:
4830                 return "unregistered from user space";
4831         case SCX_EXIT_UNREG_BPF:
4832                 return "unregistered from BPF";
4833         case SCX_EXIT_UNREG_KERN:
4834                 return "unregistered from the main kernel";
4835         case SCX_EXIT_SYSRQ:
4836                 return "disabled by sysrq-S";
4837         case SCX_EXIT_ERROR:
4838                 return "runtime error";
4839         case SCX_EXIT_ERROR_BPF:
4840                 return "scx_bpf_error";
4841         case SCX_EXIT_ERROR_STALL:
4842                 return "runnable task stall";
4843         default:
4844                 return "<UNKNOWN>";
4845         }
4846 }
4847
4848 static void scx_ops_disable_workfn(struct kthread_work *work)
4849 {
4850         struct scx_exit_info *ei = scx_exit_info;
4851         struct scx_task_iter sti;
4852         struct task_struct *p;
4853         struct rhashtable_iter rht_iter;
4854         struct scx_dispatch_q *dsq;
4855         int i, kind;
4856
4857         kind = atomic_read(&scx_exit_kind);
4858         while (true) {
4859                 /*
4860                  * NONE indicates that a new scx_ops has been registered since
4861                  * disable was scheduled - don't kill the new ops. DONE
4862                  * indicates that the ops has already been disabled.
4863                  */
4864                 if (kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE)
4865                         return;
4866                 if (atomic_try_cmpxchg(&scx_exit_kind, &kind, SCX_EXIT_DONE))
4867                         break;
4868         }
4869         ei->kind = kind;
4870         ei->reason = scx_exit_reason(ei->kind);
4871
4872         /* guarantee forward progress by bypassing scx_ops */
4873         scx_ops_bypass(true);
4874
4875         switch (scx_ops_set_enable_state(SCX_OPS_DISABLING)) {
4876         case SCX_OPS_DISABLING:
4877                 WARN_ONCE(true, "sched_ext: duplicate disabling instance?");
4878                 break;
4879         case SCX_OPS_DISABLED:
4880                 pr_warn("sched_ext: ops error detected without ops (%s)\n",
4881                         scx_exit_info->msg);
4882                 WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) !=
4883                              SCX_OPS_DISABLING);
4884                 goto done;
4885         default:
4886                 break;
4887         }
4888
4889         /*
4890          * Here, every runnable task is guaranteed to make forward progress and
4891          * we can safely use blocking synchronization constructs. Actually
4892          * disable ops.
4893          */
4894         mutex_lock(&scx_ops_enable_mutex);
4895
4896         static_branch_disable(&__scx_switched_all);
4897         WRITE_ONCE(scx_switching_all, false);
4898
4899         /*
4900          * Shut down cgroup support before tasks so that the cgroup attach path
4901          * doesn't race against scx_ops_exit_task().
4902          */
4903         scx_cgroup_lock();
4904         scx_cgroup_exit();
4905         scx_cgroup_unlock();
4906
4907         /*
4908          * The BPF scheduler is going away. All tasks including %TASK_DEAD ones
4909          * must be switched out and exited synchronously.
4910          */
4911         percpu_down_write(&scx_fork_rwsem);
4912
4913         scx_ops_init_task_enabled = false;
4914
4915         scx_task_iter_start(&sti);
4916         while ((p = scx_task_iter_next_locked(&sti))) {
4917                 const struct sched_class *old_class = p->sched_class;
4918                 const struct sched_class *new_class =
4919                         __setscheduler_class(p->policy, p->prio);
4920                 struct sched_enq_and_set_ctx ctx;
4921
4922                 if (old_class != new_class && p->se.sched_delayed)
4923                         dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
4924
4925                 sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
4926
4927                 p->sched_class = new_class;
4928                 check_class_changing(task_rq(p), p, old_class);
4929
4930                 sched_enq_and_set_task(&ctx);
4931
4932                 check_class_changed(task_rq(p), p, old_class, p->prio);
4933                 scx_ops_exit_task(p);
4934         }
4935         scx_task_iter_stop(&sti);
4936         percpu_up_write(&scx_fork_rwsem);
4937
4938         /* no task is on scx, turn off all the switches and flush in-progress calls */
4939         static_branch_disable(&__scx_ops_enabled);
4940         for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++)
4941                 static_branch_disable(&scx_has_op[i]);
4942         static_branch_disable(&scx_ops_enq_last);
4943         static_branch_disable(&scx_ops_enq_exiting);
4944         static_branch_disable(&scx_ops_cpu_preempt);
4945         static_branch_disable(&scx_builtin_idle_enabled);
4946         synchronize_rcu();
4947
4948         if (ei->kind >= SCX_EXIT_ERROR) {
4949                 pr_err("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
4950                        scx_ops.name, ei->reason);
4951
4952                 if (ei->msg[0] != '\0')
4953                         pr_err("sched_ext: %s: %s\n", scx_ops.name, ei->msg);
4954 #ifdef CONFIG_STACKTRACE
4955                 stack_trace_print(ei->bt, ei->bt_len, 2);
4956 #endif
4957         } else {
4958                 pr_info("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
4959                         scx_ops.name, ei->reason);
4960         }
4961
4962         if (scx_ops.exit)
4963                 SCX_CALL_OP(SCX_KF_UNLOCKED, exit, ei);
4964
4965         cancel_delayed_work_sync(&scx_watchdog_work);
4966
4967         /*
4968          * Delete the kobject from the hierarchy eagerly in addition to just
4969          * dropping a reference. Otherwise, if the object is deleted
4970          * asynchronously, sysfs could observe an object of the same name still
4971          * in the hierarchy when another scheduler is loaded.
4972          */
4973         kobject_del(scx_root_kobj);
4974         kobject_put(scx_root_kobj);
4975         scx_root_kobj = NULL;
4976
4977         memset(&scx_ops, 0, sizeof(scx_ops));
4978
4979         rhashtable_walk_enter(&dsq_hash, &rht_iter);
4980         do {
4981                 rhashtable_walk_start(&rht_iter);
4982
4983                 while ((dsq = rhashtable_walk_next(&rht_iter)) && !IS_ERR(dsq))
4984                         destroy_dsq(dsq->id);
4985
4986                 rhashtable_walk_stop(&rht_iter);
4987         } while (dsq == ERR_PTR(-EAGAIN));
4988         rhashtable_walk_exit(&rht_iter);
4989
4990         free_percpu(scx_dsp_ctx);
4991         scx_dsp_ctx = NULL;
4992         scx_dsp_max_batch = 0;
4993
4994         free_exit_info(scx_exit_info);
4995         scx_exit_info = NULL;
4996
4997         mutex_unlock(&scx_ops_enable_mutex);
4998
4999         WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) !=
5000                      SCX_OPS_DISABLING);
5001 done:
5002         scx_ops_bypass(false);
5003 }
5004
5005 static DEFINE_KTHREAD_WORK(scx_ops_disable_work, scx_ops_disable_workfn);
5006
5007 static void schedule_scx_ops_disable_work(void)
5008 {
5009         struct kthread_worker *helper = READ_ONCE(scx_ops_helper);
5010
5011         /*
5012          * We may be called spuriously before the first bpf_sched_ext_reg(). If
5013          * scx_ops_helper isn't set up yet, there's nothing to do.
5014          */
5015         if (helper)
5016                 kthread_queue_work(helper, &scx_ops_disable_work);
5017 }
5018
5019 static void scx_ops_disable(enum scx_exit_kind kind)
5020 {
5021         int none = SCX_EXIT_NONE;
5022
5023         if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE))
5024                 kind = SCX_EXIT_ERROR;
5025
5026         atomic_try_cmpxchg(&scx_exit_kind, &none, kind);
5027
5028         schedule_scx_ops_disable_work();
5029 }
5030
5031 static void dump_newline(struct seq_buf *s)
5032 {
5033         trace_sched_ext_dump("");
5034
5035         /* @s may be zero sized and seq_buf triggers WARN if so */
5036         if (s->size)
5037                 seq_buf_putc(s, '\n');
5038 }
5039
5040 static __printf(2, 3) void dump_line(struct seq_buf *s, const char *fmt, ...)
5041 {
5042         va_list args;
5043
5044 #ifdef CONFIG_TRACEPOINTS
5045         if (trace_sched_ext_dump_enabled()) {
5046                 /* protected by scx_dump_state()::dump_lock */
5047                 static char line_buf[SCX_EXIT_MSG_LEN];
5048
5049                 va_start(args, fmt);
5050                 vscnprintf(line_buf, sizeof(line_buf), fmt, args);
5051                 va_end(args);
5052
5053                 trace_sched_ext_dump(line_buf);
5054         }
5055 #endif
5056         /* @s may be zero sized and seq_buf triggers WARN if so */
5057         if (s->size) {
5058                 va_start(args, fmt);
5059                 seq_buf_vprintf(s, fmt, args);
5060                 va_end(args);
5061
5062                 seq_buf_putc(s, '\n');
5063         }
5064 }
5065
5066 static void dump_stack_trace(struct seq_buf *s, const char *prefix,
5067                              const unsigned long *bt, unsigned int len)
5068 {
5069         unsigned int i;
5070
5071         for (i = 0; i < len; i++)
5072                 dump_line(s, "%s%pS", prefix, (void *)bt[i]);
5073 }
5074
5075 static void ops_dump_init(struct seq_buf *s, const char *prefix)
5076 {
5077         struct scx_dump_data *dd = &scx_dump_data;
5078
5079         lockdep_assert_irqs_disabled();
5080
5081         dd->cpu = smp_processor_id();           /* allow scx_bpf_dump() */
5082         dd->first = true;
5083         dd->cursor = 0;
5084         dd->s = s;
5085         dd->prefix = prefix;
5086 }
5087
5088 static void ops_dump_flush(void)
5089 {
5090         struct scx_dump_data *dd = &scx_dump_data;
5091         char *line = dd->buf.line;
5092
5093         if (!dd->cursor)
5094                 return;
5095
5096         /*
5097          * There's something to flush and this is the first line. Insert a blank
5098          * line to distinguish ops dump.
5099          */
5100         if (dd->first) {
5101                 dump_newline(dd->s);
5102                 dd->first = false;
5103         }
5104
5105         /*
5106          * There may be multiple lines in $line. Scan and emit each line
5107          * separately.
5108          */
5109         while (true) {
5110                 char *end = line;
5111                 char c;
5112
5113                 while (*end != '\n' && *end != '\0')
5114                         end++;
5115
5116                 /*
5117                  * If $line overflowed, it may not have newline at the end.
5118                  * Always emit with a newline.
5119                  */
5120                 c = *end;
5121                 *end = '\0';
5122                 dump_line(dd->s, "%s%s", dd->prefix, line);
5123                 if (c == '\0')
5124                         break;
5125
5126                 /* move to the next line */
5127                 end++;
5128                 if (*end == '\0')
5129                         break;
5130                 line = end;
5131         }
5132
5133         dd->cursor = 0;
5134 }
5135
5136 static void ops_dump_exit(void)
5137 {
5138         ops_dump_flush();
5139         scx_dump_data.cpu = -1;
5140 }
5141
5142 static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx,
5143                           struct task_struct *p, char marker)
5144 {
5145         static unsigned long bt[SCX_EXIT_BT_LEN];
5146         char dsq_id_buf[19] = "(n/a)";
5147         unsigned long ops_state = atomic_long_read(&p->scx.ops_state);
5148         unsigned int bt_len = 0;
5149
5150         if (p->scx.dsq)
5151                 scnprintf(dsq_id_buf, sizeof(dsq_id_buf), "0x%llx",
5152                           (unsigned long long)p->scx.dsq->id);
5153
5154         dump_newline(s);
5155         dump_line(s, " %c%c %s[%d] %+ldms",
5156                   marker, task_state_to_char(p), p->comm, p->pid,
5157                   jiffies_delta_msecs(p->scx.runnable_at, dctx->at_jiffies));
5158         dump_line(s, "      scx_state/flags=%u/0x%x dsq_flags=0x%x ops_state/qseq=%lu/%lu",
5159                   scx_get_task_state(p), p->scx.flags & ~SCX_TASK_STATE_MASK,
5160                   p->scx.dsq_flags, ops_state & SCX_OPSS_STATE_MASK,
5161                   ops_state >> SCX_OPSS_QSEQ_SHIFT);
5162         dump_line(s, "      sticky/holding_cpu=%d/%d dsq_id=%s dsq_vtime=%llu",
5163                   p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf,
5164                   p->scx.dsq_vtime);
5165         dump_line(s, "      cpus=%*pb", cpumask_pr_args(p->cpus_ptr));
5166
5167         if (SCX_HAS_OP(dump_task)) {
5168                 ops_dump_init(s, "    ");
5169                 SCX_CALL_OP(SCX_KF_REST, dump_task, dctx, p);
5170                 ops_dump_exit();
5171         }
5172
5173 #ifdef CONFIG_STACKTRACE
5174         bt_len = stack_trace_save_tsk(p, bt, SCX_EXIT_BT_LEN, 1);
5175 #endif
5176         if (bt_len) {
5177                 dump_newline(s);
5178                 dump_stack_trace(s, "    ", bt, bt_len);
5179         }
5180 }
5181
5182 static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
5183 {
5184         static DEFINE_SPINLOCK(dump_lock);
5185         static const char trunc_marker[] = "\n\n~~~~ TRUNCATED ~~~~\n";
5186         struct scx_dump_ctx dctx = {
5187                 .kind = ei->kind,
5188                 .exit_code = ei->exit_code,
5189                 .reason = ei->reason,
5190                 .at_ns = ktime_get_ns(),
5191                 .at_jiffies = jiffies,
5192         };
5193         struct seq_buf s;
5194         unsigned long flags;
5195         char *buf;
5196         int cpu;
5197
5198         spin_lock_irqsave(&dump_lock, flags);
5199
5200         seq_buf_init(&s, ei->dump, dump_len);
5201
5202         if (ei->kind == SCX_EXIT_NONE) {
5203                 dump_line(&s, "Debug dump triggered by %s", ei->reason);
5204         } else {
5205                 dump_line(&s, "%s[%d] triggered exit kind %d:",
5206                           current->comm, current->pid, ei->kind);
5207                 dump_line(&s, "  %s (%s)", ei->reason, ei->msg);
5208                 dump_newline(&s);
5209                 dump_line(&s, "Backtrace:");
5210                 dump_stack_trace(&s, "  ", ei->bt, ei->bt_len);
5211         }
5212
5213         if (SCX_HAS_OP(dump)) {
5214                 ops_dump_init(&s, "");
5215                 SCX_CALL_OP(SCX_KF_UNLOCKED, dump, &dctx);
5216                 ops_dump_exit();
5217         }
5218
5219         dump_newline(&s);
5220         dump_line(&s, "CPU states");
5221         dump_line(&s, "----------");
5222
5223         for_each_possible_cpu(cpu) {
5224                 struct rq *rq = cpu_rq(cpu);
5225                 struct rq_flags rf;
5226                 struct task_struct *p;
5227                 struct seq_buf ns;
5228                 size_t avail, used;
5229                 bool idle;
5230
5231                 rq_lock(rq, &rf);
5232
5233                 idle = list_empty(&rq->scx.runnable_list) &&
5234                         rq->curr->sched_class == &idle_sched_class;
5235
5236                 if (idle && !SCX_HAS_OP(dump_cpu))
5237                         goto next;
5238
5239                 /*
5240                  * We don't yet know whether ops.dump_cpu() will produce output
5241                  * and we may want to skip the default CPU dump if it doesn't.
5242                  * Use a nested seq_buf to generate the standard dump so that we
5243                  * can decide whether to commit later.
5244                  */
5245                 avail = seq_buf_get_buf(&s, &buf);
5246                 seq_buf_init(&ns, buf, avail);
5247
5248                 dump_newline(&ns);
5249                 dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x cpu_rel=%d ops_qseq=%lu pnt_seq=%lu",
5250                           cpu, rq->scx.nr_running, rq->scx.flags,
5251                           rq->scx.cpu_released, rq->scx.ops_qseq,
5252                           rq->scx.pnt_seq);
5253                 dump_line(&ns, "          curr=%s[%d] class=%ps",
5254                           rq->curr->comm, rq->curr->pid,
5255                           rq->curr->sched_class);
5256                 if (!cpumask_empty(rq->scx.cpus_to_kick))
5257                         dump_line(&ns, "  cpus_to_kick   : %*pb",
5258                                   cpumask_pr_args(rq->scx.cpus_to_kick));
5259                 if (!cpumask_empty(rq->scx.cpus_to_kick_if_idle))
5260                         dump_line(&ns, "  idle_to_kick   : %*pb",
5261                                   cpumask_pr_args(rq->scx.cpus_to_kick_if_idle));
5262                 if (!cpumask_empty(rq->scx.cpus_to_preempt))
5263                         dump_line(&ns, "  cpus_to_preempt: %*pb",
5264                                   cpumask_pr_args(rq->scx.cpus_to_preempt));
5265                 if (!cpumask_empty(rq->scx.cpus_to_wait))
5266                         dump_line(&ns, "  cpus_to_wait   : %*pb",
5267                                   cpumask_pr_args(rq->scx.cpus_to_wait));
5268
5269                 used = seq_buf_used(&ns);
5270                 if (SCX_HAS_OP(dump_cpu)) {
5271                         ops_dump_init(&ns, "  ");
5272                         SCX_CALL_OP(SCX_KF_REST, dump_cpu, &dctx, cpu, idle);
5273                         ops_dump_exit();
5274                 }
5275
5276                 /*
5277                  * If idle && nothing generated by ops.dump_cpu(), there's
5278                  * nothing interesting. Skip.
5279                  */
5280                 if (idle && used == seq_buf_used(&ns))
5281                         goto next;
5282
5283                 /*
5284                  * $s may already have overflowed when $ns was created. If so,
5285                  * calling commit on it will trigger BUG.
5286                  */
5287                 if (avail) {
5288                         seq_buf_commit(&s, seq_buf_used(&ns));
5289                         if (seq_buf_has_overflowed(&ns))
5290                                 seq_buf_set_overflow(&s);
5291                 }
5292
5293                 if (rq->curr->sched_class == &ext_sched_class)
5294                         scx_dump_task(&s, &dctx, rq->curr, '*');
5295
5296                 list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node)
5297                         scx_dump_task(&s, &dctx, p, ' ');
5298         next:
5299                 rq_unlock(rq, &rf);
5300         }
5301
5302         if (seq_buf_has_overflowed(&s) && dump_len >= sizeof(trunc_marker))
5303                 memcpy(ei->dump + dump_len - sizeof(trunc_marker),
5304                        trunc_marker, sizeof(trunc_marker));
5305
5306         spin_unlock_irqrestore(&dump_lock, flags);
5307 }
5308
5309 static void scx_ops_error_irq_workfn(struct irq_work *irq_work)
5310 {
5311         struct scx_exit_info *ei = scx_exit_info;
5312
5313         if (ei->kind >= SCX_EXIT_ERROR)
5314                 scx_dump_state(ei, scx_ops.exit_dump_len);
5315
5316         schedule_scx_ops_disable_work();
5317 }
5318
5319 static DEFINE_IRQ_WORK(scx_ops_error_irq_work, scx_ops_error_irq_workfn);
5320
5321 static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind,
5322                                              s64 exit_code,
5323                                              const char *fmt, ...)
5324 {
5325         struct scx_exit_info *ei = scx_exit_info;
5326         int none = SCX_EXIT_NONE;
5327         va_list args;
5328
5329         if (!atomic_try_cmpxchg(&scx_exit_kind, &none, kind))
5330                 return;
5331
5332         ei->exit_code = exit_code;
5333 #ifdef CONFIG_STACKTRACE
5334         if (kind >= SCX_EXIT_ERROR)
5335                 ei->bt_len = stack_trace_save(ei->bt, SCX_EXIT_BT_LEN, 1);
5336 #endif
5337         va_start(args, fmt);
5338         vscnprintf(ei->msg, SCX_EXIT_MSG_LEN, fmt, args);
5339         va_end(args);
5340
5341         /*
5342          * Set ei->kind and ->reason for scx_dump_state(). They'll be set again
5343          * in scx_ops_disable_workfn().
5344          */
5345         ei->kind = kind;
5346         ei->reason = scx_exit_reason(ei->kind);
5347
5348         irq_work_queue(&scx_ops_error_irq_work);
5349 }
5350
5351 static struct kthread_worker *scx_create_rt_helper(const char *name)
5352 {
5353         struct kthread_worker *helper;
5354
5355         helper = kthread_create_worker(0, name);
5356         if (helper)
5357                 sched_set_fifo(helper->task);
5358         return helper;
5359 }
5360
5361 static void check_hotplug_seq(const struct sched_ext_ops *ops)
5362 {
5363         unsigned long long global_hotplug_seq;
5364
5365         /*
5366          * If a hotplug event has occurred between when a scheduler was
5367          * initialized, and when we were able to attach, exit and notify user
5368          * space about it.
5369          */
5370         if (ops->hotplug_seq) {
5371                 global_hotplug_seq = atomic_long_read(&scx_hotplug_seq);
5372                 if (ops->hotplug_seq != global_hotplug_seq) {
5373                         scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG,
5374                                      "expected hotplug seq %llu did not match actual %llu",
5375                                      ops->hotplug_seq, global_hotplug_seq);
5376                 }
5377         }
5378 }
5379
5380 static int validate_ops(const struct sched_ext_ops *ops)
5381 {
5382         /*
5383          * It doesn't make sense to specify the SCX_OPS_ENQ_LAST flag if the
5384          * ops.enqueue() callback isn't implemented.
5385          */
5386         if ((ops->flags & SCX_OPS_ENQ_LAST) && !ops->enqueue) {
5387                 scx_ops_error("SCX_OPS_ENQ_LAST requires ops.enqueue() to be implemented");
5388                 return -EINVAL;
5389         }
5390
5391         return 0;
5392 }
5393
5394 static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
5395 {
5396         struct scx_task_iter sti;
5397         struct task_struct *p;
5398         unsigned long timeout;
5399         int i, cpu, node, ret;
5400
5401         if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN),
5402                            cpu_possible_mask)) {
5403                 pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n");
5404                 return -EINVAL;
5405         }
5406
5407         mutex_lock(&scx_ops_enable_mutex);
5408
5409         if (!scx_ops_helper) {
5410                 WRITE_ONCE(scx_ops_helper,
5411                            scx_create_rt_helper("sched_ext_ops_helper"));
5412                 if (!scx_ops_helper) {
5413                         ret = -ENOMEM;
5414                         goto err_unlock;
5415                 }
5416         }
5417
5418         if (!global_dsqs) {
5419                 struct scx_dispatch_q **dsqs;
5420
5421                 dsqs = kcalloc(nr_node_ids, sizeof(dsqs[0]), GFP_KERNEL);
5422                 if (!dsqs) {
5423                         ret = -ENOMEM;
5424                         goto err_unlock;
5425                 }
5426
5427                 for_each_node_state(node, N_POSSIBLE) {
5428                         struct scx_dispatch_q *dsq;
5429
5430                         dsq = kzalloc_node(sizeof(*dsq), GFP_KERNEL, node);
5431                         if (!dsq) {
5432                                 for_each_node_state(node, N_POSSIBLE)
5433                                         kfree(dsqs[node]);
5434                                 kfree(dsqs);
5435                                 ret = -ENOMEM;
5436                                 goto err_unlock;
5437                         }
5438
5439                         init_dsq(dsq, SCX_DSQ_GLOBAL);
5440                         dsqs[node] = dsq;
5441                 }
5442
5443                 global_dsqs = dsqs;
5444         }
5445
5446         if (scx_ops_enable_state() != SCX_OPS_DISABLED) {
5447                 ret = -EBUSY;
5448                 goto err_unlock;
5449         }
5450
5451         scx_root_kobj = kzalloc(sizeof(*scx_root_kobj), GFP_KERNEL);
5452         if (!scx_root_kobj) {
5453                 ret = -ENOMEM;
5454                 goto err_unlock;
5455         }
5456
5457         scx_root_kobj->kset = scx_kset;
5458         ret = kobject_init_and_add(scx_root_kobj, &scx_ktype, NULL, "root");
5459         if (ret < 0)
5460                 goto err;
5461
5462         scx_exit_info = alloc_exit_info(ops->exit_dump_len);
5463         if (!scx_exit_info) {
5464                 ret = -ENOMEM;
5465                 goto err_del;
5466         }
5467
5468         /*
5469          * Set scx_ops, transition to ENABLING and clear exit info to arm the
5470          * disable path. Failure triggers full disabling from here on.
5471          */
5472         scx_ops = *ops;
5473
5474         WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_ENABLING) !=
5475                      SCX_OPS_DISABLED);
5476
5477         atomic_set(&scx_exit_kind, SCX_EXIT_NONE);
5478         scx_warned_zero_slice = false;
5479
5480         atomic_long_set(&scx_nr_rejected, 0);
5481
5482         for_each_possible_cpu(cpu)
5483                 cpu_rq(cpu)->scx.cpuperf_target = SCX_CPUPERF_ONE;
5484
5485         /*
5486          * Keep CPUs stable during enable so that the BPF scheduler can track
5487          * online CPUs by watching ->on/offline_cpu() after ->init().
5488          */
5489         cpus_read_lock();
5490
5491         if (scx_ops.init) {
5492                 ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init);
5493                 if (ret) {
5494                         ret = ops_sanitize_err("init", ret);
5495                         cpus_read_unlock();
5496                         scx_ops_error("ops.init() failed (%d)", ret);
5497                         goto err_disable;
5498                 }
5499         }
5500
5501         for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++)
5502                 if (((void (**)(void))ops)[i])
5503                         static_branch_enable_cpuslocked(&scx_has_op[i]);
5504
5505         check_hotplug_seq(ops);
5506 #ifdef CONFIG_SMP
5507         update_selcpu_topology();
5508 #endif
5509         cpus_read_unlock();
5510
5511         ret = validate_ops(ops);
5512         if (ret)
5513                 goto err_disable;
5514
5515         WARN_ON_ONCE(scx_dsp_ctx);
5516         scx_dsp_max_batch = ops->dispatch_max_batch ?: SCX_DSP_DFL_MAX_BATCH;
5517         scx_dsp_ctx = __alloc_percpu(struct_size_t(struct scx_dsp_ctx, buf,
5518                                                    scx_dsp_max_batch),
5519                                      __alignof__(struct scx_dsp_ctx));
5520         if (!scx_dsp_ctx) {
5521                 ret = -ENOMEM;
5522                 goto err_disable;
5523         }
5524
5525         if (ops->timeout_ms)
5526                 timeout = msecs_to_jiffies(ops->timeout_ms);
5527         else
5528                 timeout = SCX_WATCHDOG_MAX_TIMEOUT;
5529
5530         WRITE_ONCE(scx_watchdog_timeout, timeout);
5531         WRITE_ONCE(scx_watchdog_timestamp, jiffies);
5532         queue_delayed_work(system_unbound_wq, &scx_watchdog_work,
5533                            scx_watchdog_timeout / 2);
5534
5535         /*
5536          * Once __scx_ops_enabled is set, %current can be switched to SCX
5537          * anytime. This can lead to stalls as some BPF schedulers (e.g.
5538          * userspace scheduling) may not function correctly before all tasks are
5539          * switched. Init in bypass mode to guarantee forward progress.
5540          */
5541         scx_ops_bypass(true);
5542
5543         for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++)
5544                 if (((void (**)(void))ops)[i])
5545                         static_branch_enable(&scx_has_op[i]);
5546
5547         if (ops->flags & SCX_OPS_ENQ_LAST)
5548                 static_branch_enable(&scx_ops_enq_last);
5549
5550         if (ops->flags & SCX_OPS_ENQ_EXITING)
5551                 static_branch_enable(&scx_ops_enq_exiting);
5552         if (scx_ops.cpu_acquire || scx_ops.cpu_release)
5553                 static_branch_enable(&scx_ops_cpu_preempt);
5554
5555         if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) {
5556                 reset_idle_masks();
5557                 static_branch_enable(&scx_builtin_idle_enabled);
5558         } else {
5559                 static_branch_disable(&scx_builtin_idle_enabled);
5560         }
5561
5562         /*
5563          * Lock out forks, cgroup on/offlining and moves before opening the
5564          * floodgate so that they don't wander into the operations prematurely.
5565          */
5566         percpu_down_write(&scx_fork_rwsem);
5567
5568         WARN_ON_ONCE(scx_ops_init_task_enabled);
5569         scx_ops_init_task_enabled = true;
5570
5571         /*
5572          * Enable ops for every task. Fork is excluded by scx_fork_rwsem
5573          * preventing new tasks from being added. No need to exclude tasks
5574          * leaving as sched_ext_free() can handle both prepped and enabled
5575          * tasks. Prep all tasks first and then enable them with preemption
5576          * disabled.
5577          *
5578          * All cgroups should be initialized before scx_ops_init_task() so that
5579          * the BPF scheduler can reliably track each task's cgroup membership
5580          * from scx_ops_init_task(). Lock out cgroup on/offlining and task
5581          * migrations while tasks are being initialized so that
5582          * scx_cgroup_can_attach() never sees uninitialized tasks.
5583          */
5584         scx_cgroup_lock();
5585         ret = scx_cgroup_init();
5586         if (ret)
5587                 goto err_disable_unlock_all;
5588
5589         scx_task_iter_start(&sti);
5590         while ((p = scx_task_iter_next_locked(&sti))) {
5591                 /*
5592                  * @p may already be dead, have lost all its usages counts and
5593                  * be waiting for RCU grace period before being freed. @p can't
5594                  * be initialized for SCX in such cases and should be ignored.
5595                  */
5596                 if (!tryget_task_struct(p))
5597                         continue;
5598
5599                 scx_task_iter_unlock(&sti);
5600
5601                 ret = scx_ops_init_task(p, task_group(p), false);
5602                 if (ret) {
5603                         put_task_struct(p);
5604                         scx_task_iter_relock(&sti);
5605                         scx_task_iter_stop(&sti);
5606                         scx_ops_error("ops.init_task() failed (%d) for %s[%d]",
5607                                       ret, p->comm, p->pid);
5608                         goto err_disable_unlock_all;
5609                 }
5610
5611                 scx_set_task_state(p, SCX_TASK_READY);
5612
5613                 put_task_struct(p);
5614                 scx_task_iter_relock(&sti);
5615         }
5616         scx_task_iter_stop(&sti);
5617         scx_cgroup_unlock();
5618         percpu_up_write(&scx_fork_rwsem);
5619
5620         /*
5621          * All tasks are READY. It's safe to turn on scx_enabled() and switch
5622          * all eligible tasks.
5623          */
5624         WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL));
5625         static_branch_enable(&__scx_ops_enabled);
5626
5627         /*
5628          * We're fully committed and can't fail. The task READY -> ENABLED
5629          * transitions here are synchronized against sched_ext_free() through
5630          * scx_tasks_lock.
5631          */
5632         percpu_down_write(&scx_fork_rwsem);
5633         scx_task_iter_start(&sti);
5634         while ((p = scx_task_iter_next_locked(&sti))) {
5635                 const struct sched_class *old_class = p->sched_class;
5636                 const struct sched_class *new_class =
5637                         __setscheduler_class(p->policy, p->prio);
5638                 struct sched_enq_and_set_ctx ctx;
5639
5640                 if (old_class != new_class && p->se.sched_delayed)
5641                         dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
5642
5643                 sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
5644
5645                 p->scx.slice = SCX_SLICE_DFL;
5646                 p->sched_class = new_class;
5647                 check_class_changing(task_rq(p), p, old_class);
5648
5649                 sched_enq_and_set_task(&ctx);
5650
5651                 check_class_changed(task_rq(p), p, old_class, p->prio);
5652         }
5653         scx_task_iter_stop(&sti);
5654         percpu_up_write(&scx_fork_rwsem);
5655
5656         scx_ops_bypass(false);
5657
5658         if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) {
5659                 WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE);
5660                 goto err_disable;
5661         }
5662
5663         if (!(ops->flags & SCX_OPS_SWITCH_PARTIAL))
5664                 static_branch_enable(&__scx_switched_all);
5665
5666         pr_info("sched_ext: BPF scheduler \"%s\" enabled%s\n",
5667                 scx_ops.name, scx_switched_all() ? "" : " (partial)");
5668         kobject_uevent(scx_root_kobj, KOBJ_ADD);
5669         mutex_unlock(&scx_ops_enable_mutex);
5670
5671         atomic_long_inc(&scx_enable_seq);
5672
5673         return 0;
5674
5675 err_del:
5676         kobject_del(scx_root_kobj);
5677 err:
5678         kobject_put(scx_root_kobj);
5679         scx_root_kobj = NULL;
5680         if (scx_exit_info) {
5681                 free_exit_info(scx_exit_info);
5682                 scx_exit_info = NULL;
5683         }
5684 err_unlock:
5685         mutex_unlock(&scx_ops_enable_mutex);
5686         return ret;
5687
5688 err_disable_unlock_all:
5689         scx_cgroup_unlock();
5690         percpu_up_write(&scx_fork_rwsem);
5691         scx_ops_bypass(false);
5692 err_disable:
5693         mutex_unlock(&scx_ops_enable_mutex);
5694         /*
5695          * Returning an error code here would not pass all the error information
5696          * to userspace. Record errno using scx_ops_error() for cases
5697          * scx_ops_error() wasn't already invoked and exit indicating success so
5698          * that the error is notified through ops.exit() with all the details.
5699          *
5700          * Flush scx_ops_disable_work to ensure that error is reported before
5701          * init completion.
5702          */
5703         scx_ops_error("scx_ops_enable() failed (%d)", ret);
5704         kthread_flush_work(&scx_ops_disable_work);
5705         return 0;
5706 }
5707
5708
5709 /********************************************************************************
5710  * bpf_struct_ops plumbing.
5711  */
5712 #include <linux/bpf_verifier.h>
5713 #include <linux/bpf.h>
5714 #include <linux/btf.h>
5715
5716 static const struct btf_type *task_struct_type;
5717
5718 static bool bpf_scx_is_valid_access(int off, int size,
5719                                     enum bpf_access_type type,
5720                                     const struct bpf_prog *prog,
5721                                     struct bpf_insn_access_aux *info)
5722 {
5723         if (type != BPF_READ)
5724                 return false;
5725         if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS)
5726                 return false;
5727         if (off % size != 0)
5728                 return false;
5729
5730         return btf_ctx_access(off, size, type, prog, info);
5731 }
5732
5733 static int bpf_scx_btf_struct_access(struct bpf_verifier_log *log,
5734                                      const struct bpf_reg_state *reg, int off,
5735                                      int size)
5736 {
5737         const struct btf_type *t;
5738
5739         t = btf_type_by_id(reg->btf, reg->btf_id);
5740         if (t == task_struct_type) {
5741                 if (off >= offsetof(struct task_struct, scx.slice) &&
5742                     off + size <= offsetofend(struct task_struct, scx.slice))
5743                         return SCALAR_VALUE;
5744                 if (off >= offsetof(struct task_struct, scx.dsq_vtime) &&
5745                     off + size <= offsetofend(struct task_struct, scx.dsq_vtime))
5746                         return SCALAR_VALUE;
5747                 if (off >= offsetof(struct task_struct, scx.disallow) &&
5748                     off + size <= offsetofend(struct task_struct, scx.disallow))
5749                         return SCALAR_VALUE;
5750         }
5751
5752         return -EACCES;
5753 }
5754
5755 static const struct bpf_func_proto *
5756 bpf_scx_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
5757 {
5758         switch (func_id) {
5759         case BPF_FUNC_task_storage_get:
5760                 return &bpf_task_storage_get_proto;
5761         case BPF_FUNC_task_storage_delete:
5762                 return &bpf_task_storage_delete_proto;
5763         default:
5764                 return bpf_base_func_proto(func_id, prog);
5765         }
5766 }
5767
5768 static const struct bpf_verifier_ops bpf_scx_verifier_ops = {
5769         .get_func_proto = bpf_scx_get_func_proto,
5770         .is_valid_access = bpf_scx_is_valid_access,
5771         .btf_struct_access = bpf_scx_btf_struct_access,
5772 };
5773
5774 static int bpf_scx_init_member(const struct btf_type *t,
5775                                const struct btf_member *member,
5776                                void *kdata, const void *udata)
5777 {
5778         const struct sched_ext_ops *uops = udata;
5779         struct sched_ext_ops *ops = kdata;
5780         u32 moff = __btf_member_bit_offset(t, member) / 8;
5781         int ret;
5782
5783         switch (moff) {
5784         case offsetof(struct sched_ext_ops, dispatch_max_batch):
5785                 if (*(u32 *)(udata + moff) > INT_MAX)
5786                         return -E2BIG;
5787                 ops->dispatch_max_batch = *(u32 *)(udata + moff);
5788                 return 1;
5789         case offsetof(struct sched_ext_ops, flags):
5790                 if (*(u64 *)(udata + moff) & ~SCX_OPS_ALL_FLAGS)
5791                         return -EINVAL;
5792                 ops->flags = *(u64 *)(udata + moff);
5793                 return 1;
5794         case offsetof(struct sched_ext_ops, name):
5795                 ret = bpf_obj_name_cpy(ops->name, uops->name,
5796                                        sizeof(ops->name));
5797                 if (ret < 0)
5798                         return ret;
5799                 if (ret == 0)
5800                         return -EINVAL;
5801                 return 1;
5802         case offsetof(struct sched_ext_ops, timeout_ms):
5803                 if (msecs_to_jiffies(*(u32 *)(udata + moff)) >
5804                     SCX_WATCHDOG_MAX_TIMEOUT)
5805                         return -E2BIG;
5806                 ops->timeout_ms = *(u32 *)(udata + moff);
5807                 return 1;
5808         case offsetof(struct sched_ext_ops, exit_dump_len):
5809                 ops->exit_dump_len =
5810                         *(u32 *)(udata + moff) ?: SCX_EXIT_DUMP_DFL_LEN;
5811                 return 1;
5812         case offsetof(struct sched_ext_ops, hotplug_seq):
5813                 ops->hotplug_seq = *(u64 *)(udata + moff);
5814                 return 1;
5815         }
5816
5817         return 0;
5818 }
5819
5820 static int bpf_scx_check_member(const struct btf_type *t,
5821                                 const struct btf_member *member,
5822                                 const struct bpf_prog *prog)
5823 {
5824         u32 moff = __btf_member_bit_offset(t, member) / 8;
5825
5826         switch (moff) {
5827         case offsetof(struct sched_ext_ops, init_task):
5828 #ifdef CONFIG_EXT_GROUP_SCHED
5829         case offsetof(struct sched_ext_ops, cgroup_init):
5830         case offsetof(struct sched_ext_ops, cgroup_exit):
5831         case offsetof(struct sched_ext_ops, cgroup_prep_move):
5832 #endif
5833         case offsetof(struct sched_ext_ops, cpu_online):
5834         case offsetof(struct sched_ext_ops, cpu_offline):
5835         case offsetof(struct sched_ext_ops, init):
5836         case offsetof(struct sched_ext_ops, exit):
5837                 break;
5838         default:
5839                 if (prog->sleepable)
5840                         return -EINVAL;
5841         }
5842
5843         return 0;
5844 }
5845
5846 static int bpf_scx_reg(void *kdata, struct bpf_link *link)
5847 {
5848         return scx_ops_enable(kdata, link);
5849 }
5850
5851 static void bpf_scx_unreg(void *kdata, struct bpf_link *link)
5852 {
5853         scx_ops_disable(SCX_EXIT_UNREG);
5854         kthread_flush_work(&scx_ops_disable_work);
5855 }
5856
5857 static int bpf_scx_init(struct btf *btf)
5858 {
5859         task_struct_type = btf_type_by_id(btf, btf_tracing_ids[BTF_TRACING_TYPE_TASK]);
5860
5861         return 0;
5862 }
5863
5864 static int bpf_scx_update(void *kdata, void *old_kdata, struct bpf_link *link)
5865 {
5866         /*
5867          * sched_ext does not support updating the actively-loaded BPF
5868          * scheduler, as registering a BPF scheduler can always fail if the
5869          * scheduler returns an error code for e.g. ops.init(), ops.init_task(),
5870          * etc. Similarly, we can always race with unregistration happening
5871          * elsewhere, such as with sysrq.
5872          */
5873         return -EOPNOTSUPP;
5874 }
5875
5876 static int bpf_scx_validate(void *kdata)
5877 {
5878         return 0;
5879 }
5880
5881 static s32 sched_ext_ops__select_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags) { return -EINVAL; }
5882 static void sched_ext_ops__enqueue(struct task_struct *p, u64 enq_flags) {}
5883 static void sched_ext_ops__dequeue(struct task_struct *p, u64 enq_flags) {}
5884 static void sched_ext_ops__dispatch(s32 prev_cpu, struct task_struct *prev__nullable) {}
5885 static void sched_ext_ops__tick(struct task_struct *p) {}
5886 static void sched_ext_ops__runnable(struct task_struct *p, u64 enq_flags) {}
5887 static void sched_ext_ops__running(struct task_struct *p) {}
5888 static void sched_ext_ops__stopping(struct task_struct *p, bool runnable) {}
5889 static void sched_ext_ops__quiescent(struct task_struct *p, u64 deq_flags) {}
5890 static bool sched_ext_ops__yield(struct task_struct *from, struct task_struct *to__nullable) { return false; }
5891 static bool sched_ext_ops__core_sched_before(struct task_struct *a, struct task_struct *b) { return false; }
5892 static void sched_ext_ops__set_weight(struct task_struct *p, u32 weight) {}
5893 static void sched_ext_ops__set_cpumask(struct task_struct *p, const struct cpumask *mask) {}
5894 static void sched_ext_ops__update_idle(s32 cpu, bool idle) {}
5895 static void sched_ext_ops__cpu_acquire(s32 cpu, struct scx_cpu_acquire_args *args) {}
5896 static void sched_ext_ops__cpu_release(s32 cpu, struct scx_cpu_release_args *args) {}
5897 static s32 sched_ext_ops__init_task(struct task_struct *p, struct scx_init_task_args *args) { return -EINVAL; }
5898 static void sched_ext_ops__exit_task(struct task_struct *p, struct scx_exit_task_args *args) {}
5899 static void sched_ext_ops__enable(struct task_struct *p) {}
5900 static void sched_ext_ops__disable(struct task_struct *p) {}
5901 #ifdef CONFIG_EXT_GROUP_SCHED
5902 static s32 sched_ext_ops__cgroup_init(struct cgroup *cgrp, struct scx_cgroup_init_args *args) { return -EINVAL; }
5903 static void sched_ext_ops__cgroup_exit(struct cgroup *cgrp) {}
5904 static s32 sched_ext_ops__cgroup_prep_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) { return -EINVAL; }
5905 static void sched_ext_ops__cgroup_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {}
5906 static void sched_ext_ops__cgroup_cancel_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {}
5907 static void sched_ext_ops__cgroup_set_weight(struct cgroup *cgrp, u32 weight) {}
5908 #endif
5909 static void sched_ext_ops__cpu_online(s32 cpu) {}
5910 static void sched_ext_ops__cpu_offline(s32 cpu) {}
5911 static s32 sched_ext_ops__init(void) { return -EINVAL; }
5912 static void sched_ext_ops__exit(struct scx_exit_info *info) {}
5913 static void sched_ext_ops__dump(struct scx_dump_ctx *ctx) {}
5914 static void sched_ext_ops__dump_cpu(struct scx_dump_ctx *ctx, s32 cpu, bool idle) {}
5915 static void sched_ext_ops__dump_task(struct scx_dump_ctx *ctx, struct task_struct *p) {}
5916
5917 static struct sched_ext_ops __bpf_ops_sched_ext_ops = {
5918         .select_cpu             = sched_ext_ops__select_cpu,
5919         .enqueue                = sched_ext_ops__enqueue,
5920         .dequeue                = sched_ext_ops__dequeue,
5921         .dispatch               = sched_ext_ops__dispatch,
5922         .tick                   = sched_ext_ops__tick,
5923         .runnable               = sched_ext_ops__runnable,
5924         .running                = sched_ext_ops__running,
5925         .stopping               = sched_ext_ops__stopping,
5926         .quiescent              = sched_ext_ops__quiescent,
5927         .yield                  = sched_ext_ops__yield,
5928         .core_sched_before      = sched_ext_ops__core_sched_before,
5929         .set_weight             = sched_ext_ops__set_weight,
5930         .set_cpumask            = sched_ext_ops__set_cpumask,
5931         .update_idle            = sched_ext_ops__update_idle,
5932         .cpu_acquire            = sched_ext_ops__cpu_acquire,
5933         .cpu_release            = sched_ext_ops__cpu_release,
5934         .init_task              = sched_ext_ops__init_task,
5935         .exit_task              = sched_ext_ops__exit_task,
5936         .enable                 = sched_ext_ops__enable,
5937         .disable                = sched_ext_ops__disable,
5938 #ifdef CONFIG_EXT_GROUP_SCHED
5939         .cgroup_init            = sched_ext_ops__cgroup_init,
5940         .cgroup_exit            = sched_ext_ops__cgroup_exit,
5941         .cgroup_prep_move       = sched_ext_ops__cgroup_prep_move,
5942         .cgroup_move            = sched_ext_ops__cgroup_move,
5943         .cgroup_cancel_move     = sched_ext_ops__cgroup_cancel_move,
5944         .cgroup_set_weight      = sched_ext_ops__cgroup_set_weight,
5945 #endif
5946         .cpu_online             = sched_ext_ops__cpu_online,
5947         .cpu_offline            = sched_ext_ops__cpu_offline,
5948         .init                   = sched_ext_ops__init,
5949         .exit                   = sched_ext_ops__exit,
5950         .dump                   = sched_ext_ops__dump,
5951         .dump_cpu               = sched_ext_ops__dump_cpu,
5952         .dump_task              = sched_ext_ops__dump_task,
5953 };
5954
5955 static struct bpf_struct_ops bpf_sched_ext_ops = {
5956         .verifier_ops = &bpf_scx_verifier_ops,
5957         .reg = bpf_scx_reg,
5958         .unreg = bpf_scx_unreg,
5959         .check_member = bpf_scx_check_member,
5960         .init_member = bpf_scx_init_member,
5961         .init = bpf_scx_init,
5962         .update = bpf_scx_update,
5963         .validate = bpf_scx_validate,
5964         .name = "sched_ext_ops",
5965         .owner = THIS_MODULE,
5966         .cfi_stubs = &__bpf_ops_sched_ext_ops
5967 };
5968
5969
5970 /********************************************************************************
5971  * System integration and init.
5972  */
5973
5974 static void sysrq_handle_sched_ext_reset(u8 key)
5975 {
5976         if (scx_ops_helper)
5977                 scx_ops_disable(SCX_EXIT_SYSRQ);
5978         else
5979                 pr_info("sched_ext: BPF scheduler not yet used\n");
5980 }
5981
5982 static const struct sysrq_key_op sysrq_sched_ext_reset_op = {
5983         .handler        = sysrq_handle_sched_ext_reset,
5984         .help_msg       = "reset-sched-ext(S)",
5985         .action_msg     = "Disable sched_ext and revert all tasks to CFS",
5986         .enable_mask    = SYSRQ_ENABLE_RTNICE,
5987 };
5988
5989 static void sysrq_handle_sched_ext_dump(u8 key)
5990 {
5991         struct scx_exit_info ei = { .kind = SCX_EXIT_NONE, .reason = "SysRq-D" };
5992
5993         if (scx_enabled())
5994                 scx_dump_state(&ei, 0);
5995 }
5996
5997 static const struct sysrq_key_op sysrq_sched_ext_dump_op = {
5998         .handler        = sysrq_handle_sched_ext_dump,
5999         .help_msg       = "dump-sched-ext(D)",
6000         .action_msg     = "Trigger sched_ext debug dump",
6001         .enable_mask    = SYSRQ_ENABLE_RTNICE,
6002 };
6003
6004 static bool can_skip_idle_kick(struct rq *rq)
6005 {
6006         lockdep_assert_rq_held(rq);
6007
6008         /*
6009          * We can skip idle kicking if @rq is going to go through at least one
6010          * full SCX scheduling cycle before going idle. Just checking whether
6011          * curr is not idle is insufficient because we could be racing
6012          * balance_one() trying to pull the next task from a remote rq, which
6013          * may fail, and @rq may become idle afterwards.
6014          *
6015          * The race window is small and we don't and can't guarantee that @rq is
6016          * only kicked while idle anyway. Skip only when sure.
6017          */
6018         return !is_idle_task(rq->curr) && !(rq->scx.flags & SCX_RQ_IN_BALANCE);
6019 }
6020
6021 static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *pseqs)
6022 {
6023         struct rq *rq = cpu_rq(cpu);
6024         struct scx_rq *this_scx = &this_rq->scx;
6025         bool should_wait = false;
6026         unsigned long flags;
6027
6028         raw_spin_rq_lock_irqsave(rq, flags);
6029
6030         /*
6031          * During CPU hotplug, a CPU may depend on kicking itself to make
6032          * forward progress. Allow kicking self regardless of online state.
6033          */
6034         if (cpu_online(cpu) || cpu == cpu_of(this_rq)) {
6035                 if (cpumask_test_cpu(cpu, this_scx->cpus_to_preempt)) {
6036                         if (rq->curr->sched_class == &ext_sched_class)
6037                                 rq->curr->scx.slice = 0;
6038                         cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt);
6039                 }
6040
6041                 if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) {
6042                         pseqs[cpu] = rq->scx.pnt_seq;
6043                         should_wait = true;
6044                 }
6045
6046                 resched_curr(rq);
6047         } else {
6048                 cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt);
6049                 cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
6050         }
6051
6052         raw_spin_rq_unlock_irqrestore(rq, flags);
6053
6054         return should_wait;
6055 }
6056
6057 static void kick_one_cpu_if_idle(s32 cpu, struct rq *this_rq)
6058 {
6059         struct rq *rq = cpu_rq(cpu);
6060         unsigned long flags;
6061
6062         raw_spin_rq_lock_irqsave(rq, flags);
6063
6064         if (!can_skip_idle_kick(rq) &&
6065             (cpu_online(cpu) || cpu == cpu_of(this_rq)))
6066                 resched_curr(rq);
6067
6068         raw_spin_rq_unlock_irqrestore(rq, flags);
6069 }
6070
6071 static void kick_cpus_irq_workfn(struct irq_work *irq_work)
6072 {
6073         struct rq *this_rq = this_rq();
6074         struct scx_rq *this_scx = &this_rq->scx;
6075         unsigned long *pseqs = this_cpu_ptr(scx_kick_cpus_pnt_seqs);
6076         bool should_wait = false;
6077         s32 cpu;
6078
6079         for_each_cpu(cpu, this_scx->cpus_to_kick) {
6080                 should_wait |= kick_one_cpu(cpu, this_rq, pseqs);
6081                 cpumask_clear_cpu(cpu, this_scx->cpus_to_kick);
6082                 cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle);
6083         }
6084
6085         for_each_cpu(cpu, this_scx->cpus_to_kick_if_idle) {
6086                 kick_one_cpu_if_idle(cpu, this_rq);
6087                 cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle);
6088         }
6089
6090         if (!should_wait)
6091                 return;
6092
6093         for_each_cpu(cpu, this_scx->cpus_to_wait) {
6094                 unsigned long *wait_pnt_seq = &cpu_rq(cpu)->scx.pnt_seq;
6095
6096                 if (cpu != cpu_of(this_rq)) {
6097                         /*
6098                          * Pairs with smp_store_release() issued by this CPU in
6099                          * switch_class() on the resched path.
6100                          *
6101                          * We busy-wait here to guarantee that no other task can
6102                          * be scheduled on our core before the target CPU has
6103                          * entered the resched path.
6104                          */
6105                         while (smp_load_acquire(wait_pnt_seq) == pseqs[cpu])
6106                                 cpu_relax();
6107                 }
6108
6109                 cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
6110         }
6111 }
6112
6113 /**
6114  * print_scx_info - print out sched_ext scheduler state
6115  * @log_lvl: the log level to use when printing
6116  * @p: target task
6117  *
6118  * If a sched_ext scheduler is enabled, print the name and state of the
6119  * scheduler. If @p is on sched_ext, print further information about the task.
6120  *
6121  * This function can be safely called on any task as long as the task_struct
6122  * itself is accessible. While safe, this function isn't synchronized and may
6123  * print out mixups or garbages of limited length.
6124  */
6125 void print_scx_info(const char *log_lvl, struct task_struct *p)
6126 {
6127         enum scx_ops_enable_state state = scx_ops_enable_state();
6128         const char *all = READ_ONCE(scx_switching_all) ? "+all" : "";
6129         char runnable_at_buf[22] = "?";
6130         struct sched_class *class;
6131         unsigned long runnable_at;
6132
6133         if (state == SCX_OPS_DISABLED)
6134                 return;
6135
6136         /*
6137          * Carefully check if the task was running on sched_ext, and then
6138          * carefully copy the time it's been runnable, and its state.
6139          */
6140         if (copy_from_kernel_nofault(&class, &p->sched_class, sizeof(class)) ||
6141             class != &ext_sched_class) {
6142                 printk("%sSched_ext: %s (%s%s)", log_lvl, scx_ops.name,
6143                        scx_ops_enable_state_str[state], all);
6144                 return;
6145         }
6146
6147         if (!copy_from_kernel_nofault(&runnable_at, &p->scx.runnable_at,
6148                                       sizeof(runnable_at)))
6149                 scnprintf(runnable_at_buf, sizeof(runnable_at_buf), "%+ldms",
6150                           jiffies_delta_msecs(runnable_at, jiffies));
6151
6152         /* print everything onto one line to conserve console space */
6153         printk("%sSched_ext: %s (%s%s), task: runnable_at=%s",
6154                log_lvl, scx_ops.name, scx_ops_enable_state_str[state], all,
6155                runnable_at_buf);
6156 }
6157
6158 static int scx_pm_handler(struct notifier_block *nb, unsigned long event, void *ptr)
6159 {
6160         /*
6161          * SCX schedulers often have userspace components which are sometimes
6162          * involved in critial scheduling paths. PM operations involve freezing
6163          * userspace which can lead to scheduling misbehaviors including stalls.
6164          * Let's bypass while PM operations are in progress.
6165          */
6166         switch (event) {
6167         case PM_HIBERNATION_PREPARE:
6168         case PM_SUSPEND_PREPARE:
6169         case PM_RESTORE_PREPARE:
6170                 scx_ops_bypass(true);
6171                 break;
6172         case PM_POST_HIBERNATION:
6173         case PM_POST_SUSPEND:
6174         case PM_POST_RESTORE:
6175                 scx_ops_bypass(false);
6176                 break;
6177         }
6178
6179         return NOTIFY_OK;
6180 }
6181
6182 static struct notifier_block scx_pm_notifier = {
6183         .notifier_call = scx_pm_handler,
6184 };
6185
6186 void __init init_sched_ext_class(void)
6187 {
6188         s32 cpu, v;
6189
6190         /*
6191          * The following is to prevent the compiler from optimizing out the enum
6192          * definitions so that BPF scheduler implementations can use them
6193          * through the generated vmlinux.h.
6194          */
6195         WRITE_ONCE(v, SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP | SCX_KICK_PREEMPT |
6196                    SCX_TG_ONLINE);
6197
6198         BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params));
6199 #ifdef CONFIG_SMP
6200         BUG_ON(!alloc_cpumask_var(&idle_masks.cpu, GFP_KERNEL));
6201         BUG_ON(!alloc_cpumask_var(&idle_masks.smt, GFP_KERNEL));
6202 #endif
6203         scx_kick_cpus_pnt_seqs =
6204                 __alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids,
6205                                __alignof__(scx_kick_cpus_pnt_seqs[0]));
6206         BUG_ON(!scx_kick_cpus_pnt_seqs);
6207
6208         for_each_possible_cpu(cpu) {
6209                 struct rq *rq = cpu_rq(cpu);
6210
6211                 init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL);
6212                 INIT_LIST_HEAD(&rq->scx.runnable_list);
6213                 INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals);
6214
6215                 BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick, GFP_KERNEL));
6216                 BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL));
6217                 BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_preempt, GFP_KERNEL));
6218                 BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_wait, GFP_KERNEL));
6219                 init_irq_work(&rq->scx.deferred_irq_work, deferred_irq_workfn);
6220                 init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn);
6221
6222                 if (cpu_online(cpu))
6223                         cpu_rq(cpu)->scx.flags |= SCX_RQ_ONLINE;
6224         }
6225
6226         register_sysrq_key('S', &sysrq_sched_ext_reset_op);
6227         register_sysrq_key('D', &sysrq_sched_ext_dump_op);
6228         INIT_DELAYED_WORK(&scx_watchdog_work, scx_watchdog_workfn);
6229 }
6230
6231
6232 /********************************************************************************
6233  * Helpers that can be called from the BPF scheduler.
6234  */
6235 #include <linux/btf_ids.h>
6236
6237 __bpf_kfunc_start_defs();
6238
6239 /**
6240  * scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu()
6241  * @p: task_struct to select a CPU for
6242  * @prev_cpu: CPU @p was on previously
6243  * @wake_flags: %SCX_WAKE_* flags
6244  * @is_idle: out parameter indicating whether the returned CPU is idle
6245  *
6246  * Can only be called from ops.select_cpu() if the built-in CPU selection is
6247  * enabled - ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE is set.
6248  * @p, @prev_cpu and @wake_flags match ops.select_cpu().
6249  *
6250  * Returns the picked CPU with *@is_idle indicating whether the picked CPU is
6251  * currently idle and thus a good candidate for direct dispatching.
6252  */
6253 __bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
6254                                        u64 wake_flags, bool *is_idle)
6255 {
6256         if (!static_branch_likely(&scx_builtin_idle_enabled)) {
6257                 scx_ops_error("built-in idle tracking is disabled");
6258                 goto prev_cpu;
6259         }
6260
6261         if (!scx_kf_allowed(SCX_KF_SELECT_CPU))
6262                 goto prev_cpu;
6263
6264 #ifdef CONFIG_SMP
6265         return scx_select_cpu_dfl(p, prev_cpu, wake_flags, is_idle);
6266 #endif
6267
6268 prev_cpu:
6269         *is_idle = false;
6270         return prev_cpu;
6271 }
6272
6273 __bpf_kfunc_end_defs();
6274
6275 BTF_KFUNCS_START(scx_kfunc_ids_select_cpu)
6276 BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU)
6277 BTF_KFUNCS_END(scx_kfunc_ids_select_cpu)
6278
6279 static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = {
6280         .owner                  = THIS_MODULE,
6281         .set                    = &scx_kfunc_ids_select_cpu,
6282 };
6283
6284 static bool scx_dsq_insert_preamble(struct task_struct *p, u64 enq_flags)
6285 {
6286         if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH))
6287                 return false;
6288
6289         lockdep_assert_irqs_disabled();
6290
6291         if (unlikely(!p)) {
6292                 scx_ops_error("called with NULL task");
6293                 return false;
6294         }
6295
6296         if (unlikely(enq_flags & __SCX_ENQ_INTERNAL_MASK)) {
6297                 scx_ops_error("invalid enq_flags 0x%llx", enq_flags);
6298                 return false;
6299         }
6300
6301         return true;
6302 }
6303
6304 static void scx_dsq_insert_commit(struct task_struct *p, u64 dsq_id,
6305                                   u64 enq_flags)
6306 {
6307         struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
6308         struct task_struct *ddsp_task;
6309
6310         ddsp_task = __this_cpu_read(direct_dispatch_task);
6311         if (ddsp_task) {
6312                 mark_direct_dispatch(ddsp_task, p, dsq_id, enq_flags);
6313                 return;
6314         }
6315
6316         if (unlikely(dspc->cursor >= scx_dsp_max_batch)) {
6317                 scx_ops_error("dispatch buffer overflow");
6318                 return;
6319         }
6320
6321         dspc->buf[dspc->cursor++] = (struct scx_dsp_buf_ent){
6322                 .task = p,
6323                 .qseq = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_QSEQ_MASK,
6324                 .dsq_id = dsq_id,
6325                 .enq_flags = enq_flags,
6326         };
6327 }
6328
6329 __bpf_kfunc_start_defs();
6330
6331 /**
6332  * scx_bpf_dsq_insert - Insert a task into the FIFO queue of a DSQ
6333  * @p: task_struct to insert
6334  * @dsq_id: DSQ to insert into
6335  * @slice: duration @p can run for in nsecs, 0 to keep the current value
6336  * @enq_flags: SCX_ENQ_*
6337  *
6338  * Insert @p into the FIFO queue of the DSQ identified by @dsq_id. It is safe to
6339  * call this function spuriously. Can be called from ops.enqueue(),
6340  * ops.select_cpu(), and ops.dispatch().
6341  *
6342  * When called from ops.select_cpu() or ops.enqueue(), it's for direct dispatch
6343  * and @p must match the task being enqueued. Also, %SCX_DSQ_LOCAL_ON can't be
6344  * used to target the local DSQ of a CPU other than the enqueueing one. Use
6345  * ops.select_cpu() to be on the target CPU in the first place.
6346  *
6347  * When called from ops.select_cpu(), @enq_flags and @dsp_id are stored, and @p
6348  * will be directly inserted into the corresponding dispatch queue after
6349  * ops.select_cpu() returns. If @p is inserted into SCX_DSQ_LOCAL, it will be
6350  * inserted into the local DSQ of the CPU returned by ops.select_cpu().
6351  * @enq_flags are OR'd with the enqueue flags on the enqueue path before the
6352  * task is inserted.
6353  *
6354  * When called from ops.dispatch(), there are no restrictions on @p or @dsq_id
6355  * and this function can be called upto ops.dispatch_max_batch times to insert
6356  * multiple tasks. scx_bpf_dispatch_nr_slots() returns the number of the
6357  * remaining slots. scx_bpf_consume() flushes the batch and resets the counter.
6358  *
6359  * This function doesn't have any locking restrictions and may be called under
6360  * BPF locks (in the future when BPF introduces more flexible locking).
6361  *
6362  * @p is allowed to run for @slice. The scheduling path is triggered on slice
6363  * exhaustion. If zero, the current residual slice is maintained. If
6364  * %SCX_SLICE_INF, @p never expires and the BPF scheduler must kick the CPU with
6365  * scx_bpf_kick_cpu() to trigger scheduling.
6366  */
6367 __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice,
6368                                     u64 enq_flags)
6369 {
6370         if (!scx_dsq_insert_preamble(p, enq_flags))
6371                 return;
6372
6373         if (slice)
6374                 p->scx.slice = slice;
6375         else
6376                 p->scx.slice = p->scx.slice ?: 1;
6377
6378         scx_dsq_insert_commit(p, dsq_id, enq_flags);
6379 }
6380
6381 /* for backward compatibility, will be removed in v6.15 */
6382 __bpf_kfunc void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice,
6383                                   u64 enq_flags)
6384 {
6385         printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch() renamed to scx_bpf_dsq_insert()");
6386         scx_bpf_dsq_insert(p, dsq_id, slice, enq_flags);
6387 }
6388
6389 /**
6390  * scx_bpf_dsq_insert_vtime - Insert a task into the vtime priority queue of a DSQ
6391  * @p: task_struct to insert
6392  * @dsq_id: DSQ to insert into
6393  * @slice: duration @p can run for in nsecs, 0 to keep the current value
6394  * @vtime: @p's ordering inside the vtime-sorted queue of the target DSQ
6395  * @enq_flags: SCX_ENQ_*
6396  *
6397  * Insert @p into the vtime priority queue of the DSQ identified by @dsq_id.
6398  * Tasks queued into the priority queue are ordered by @vtime. All other aspects
6399  * are identical to scx_bpf_dsq_insert().
6400  *
6401  * @vtime ordering is according to time_before64() which considers wrapping. A
6402  * numerically larger vtime may indicate an earlier position in the ordering and
6403  * vice-versa.
6404  *
6405  * A DSQ can only be used as a FIFO or priority queue at any given time and this
6406  * function must not be called on a DSQ which already has one or more FIFO tasks
6407  * queued and vice-versa. Also, the built-in DSQs (SCX_DSQ_LOCAL and
6408  * SCX_DSQ_GLOBAL) cannot be used as priority queues.
6409  */
6410 __bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id,
6411                                           u64 slice, u64 vtime, u64 enq_flags)
6412 {
6413         if (!scx_dsq_insert_preamble(p, enq_flags))
6414                 return;
6415
6416         if (slice)
6417                 p->scx.slice = slice;
6418         else
6419                 p->scx.slice = p->scx.slice ?: 1;
6420
6421         p->scx.dsq_vtime = vtime;
6422
6423         scx_dsq_insert_commit(p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
6424 }
6425
6426 /* for backward compatibility, will be removed in v6.15 */
6427 __bpf_kfunc void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id,
6428                                         u64 slice, u64 vtime, u64 enq_flags)
6429 {
6430         printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_vtime() renamed to scx_bpf_dsq_insert_vtime()");
6431         scx_bpf_dsq_insert_vtime(p, dsq_id, slice, vtime, enq_flags);
6432 }
6433
6434 __bpf_kfunc_end_defs();
6435
6436 BTF_KFUNCS_START(scx_kfunc_ids_enqueue_dispatch)
6437 BTF_ID_FLAGS(func, scx_bpf_dsq_insert, KF_RCU)
6438 BTF_ID_FLAGS(func, scx_bpf_dsq_insert_vtime, KF_RCU)
6439 BTF_ID_FLAGS(func, scx_bpf_dispatch, KF_RCU)
6440 BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime, KF_RCU)
6441 BTF_KFUNCS_END(scx_kfunc_ids_enqueue_dispatch)
6442
6443 static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = {
6444         .owner                  = THIS_MODULE,
6445         .set                    = &scx_kfunc_ids_enqueue_dispatch,
6446 };
6447
6448 static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
6449                          struct task_struct *p, u64 dsq_id, u64 enq_flags)
6450 {
6451         struct scx_dispatch_q *src_dsq = kit->dsq, *dst_dsq;
6452         struct rq *this_rq, *src_rq, *locked_rq;
6453         bool dispatched = false;
6454         bool in_balance;
6455         unsigned long flags;
6456
6457         if (!scx_kf_allowed_if_unlocked() && !scx_kf_allowed(SCX_KF_DISPATCH))
6458                 return false;
6459
6460         /*
6461          * Can be called from either ops.dispatch() locking this_rq() or any
6462          * context where no rq lock is held. If latter, lock @p's task_rq which
6463          * we'll likely need anyway.
6464          */
6465         src_rq = task_rq(p);
6466
6467         local_irq_save(flags);
6468         this_rq = this_rq();
6469         in_balance = this_rq->scx.flags & SCX_RQ_IN_BALANCE;
6470
6471         if (in_balance) {
6472                 if (this_rq != src_rq) {
6473                         raw_spin_rq_unlock(this_rq);
6474                         raw_spin_rq_lock(src_rq);
6475                 }
6476         } else {
6477                 raw_spin_rq_lock(src_rq);
6478         }
6479
6480         /*
6481          * If the BPF scheduler keeps calling this function repeatedly, it can
6482          * cause similar live-lock conditions as consume_dispatch_q(). Insert a
6483          * breather if necessary.
6484          */
6485         scx_ops_breather(src_rq);
6486
6487         locked_rq = src_rq;
6488         raw_spin_lock(&src_dsq->lock);
6489
6490         /*
6491          * Did someone else get to it? @p could have already left $src_dsq, got
6492          * re-enqueud, or be in the process of being consumed by someone else.
6493          */
6494         if (unlikely(p->scx.dsq != src_dsq ||
6495                      u32_before(kit->cursor.priv, p->scx.dsq_seq) ||
6496                      p->scx.holding_cpu >= 0) ||
6497             WARN_ON_ONCE(src_rq != task_rq(p))) {
6498                 raw_spin_unlock(&src_dsq->lock);
6499                 goto out;
6500         }
6501
6502         /* @p is still on $src_dsq and stable, determine the destination */
6503         dst_dsq = find_dsq_for_dispatch(this_rq, dsq_id, p);
6504
6505         /*
6506          * Apply vtime and slice updates before moving so that the new time is
6507          * visible before inserting into $dst_dsq. @p is still on $src_dsq but
6508          * this is safe as we're locking it.
6509          */
6510         if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_VTIME)
6511                 p->scx.dsq_vtime = kit->vtime;
6512         if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_SLICE)
6513                 p->scx.slice = kit->slice;
6514
6515         /* execute move */
6516         locked_rq = move_task_between_dsqs(p, enq_flags, src_dsq, dst_dsq);
6517         dispatched = true;
6518 out:
6519         if (in_balance) {
6520                 if (this_rq != locked_rq) {
6521                         raw_spin_rq_unlock(locked_rq);
6522                         raw_spin_rq_lock(this_rq);
6523                 }
6524         } else {
6525                 raw_spin_rq_unlock_irqrestore(locked_rq, flags);
6526         }
6527
6528         kit->cursor.flags &= ~(__SCX_DSQ_ITER_HAS_SLICE |
6529                                __SCX_DSQ_ITER_HAS_VTIME);
6530         return dispatched;
6531 }
6532
6533 __bpf_kfunc_start_defs();
6534
6535 /**
6536  * scx_bpf_dispatch_nr_slots - Return the number of remaining dispatch slots
6537  *
6538  * Can only be called from ops.dispatch().
6539  */
6540 __bpf_kfunc u32 scx_bpf_dispatch_nr_slots(void)
6541 {
6542         if (!scx_kf_allowed(SCX_KF_DISPATCH))
6543                 return 0;
6544
6545         return scx_dsp_max_batch - __this_cpu_read(scx_dsp_ctx->cursor);
6546 }
6547
6548 /**
6549  * scx_bpf_dispatch_cancel - Cancel the latest dispatch
6550  *
6551  * Cancel the latest dispatch. Can be called multiple times to cancel further
6552  * dispatches. Can only be called from ops.dispatch().
6553  */
6554 __bpf_kfunc void scx_bpf_dispatch_cancel(void)
6555 {
6556         struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
6557
6558         if (!scx_kf_allowed(SCX_KF_DISPATCH))
6559                 return;
6560
6561         if (dspc->cursor > 0)
6562                 dspc->cursor--;
6563         else
6564                 scx_ops_error("dispatch buffer underflow");
6565 }
6566
6567 /**
6568  * scx_bpf_dsq_move_to_local - move a task from a DSQ to the current CPU's local DSQ
6569  * @dsq_id: DSQ to move task from
6570  *
6571  * Move a task from the non-local DSQ identified by @dsq_id to the current CPU's
6572  * local DSQ for execution. Can only be called from ops.dispatch().
6573  *
6574  * This function flushes the in-flight dispatches from scx_bpf_dsq_insert()
6575  * before trying to move from the specified DSQ. It may also grab rq locks and
6576  * thus can't be called under any BPF locks.
6577  *
6578  * Returns %true if a task has been moved, %false if there isn't any task to
6579  * move.
6580  */
6581 __bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id)
6582 {
6583         struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
6584         struct scx_dispatch_q *dsq;
6585
6586         if (!scx_kf_allowed(SCX_KF_DISPATCH))
6587                 return false;
6588
6589         flush_dispatch_buf(dspc->rq);
6590
6591         dsq = find_user_dsq(dsq_id);
6592         if (unlikely(!dsq)) {
6593                 scx_ops_error("invalid DSQ ID 0x%016llx", dsq_id);
6594                 return false;
6595         }
6596
6597         if (consume_dispatch_q(dspc->rq, dsq)) {
6598                 /*
6599                  * A successfully consumed task can be dequeued before it starts
6600                  * running while the CPU is trying to migrate other dispatched
6601                  * tasks. Bump nr_tasks to tell balance_scx() to retry on empty
6602                  * local DSQ.
6603                  */
6604                 dspc->nr_tasks++;
6605                 return true;
6606         } else {
6607                 return false;
6608         }
6609 }
6610
6611 /* for backward compatibility, will be removed in v6.15 */
6612 __bpf_kfunc bool scx_bpf_consume(u64 dsq_id)
6613 {
6614         printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_consume() renamed to scx_bpf_dsq_move_to_local()");
6615         return scx_bpf_dsq_move_to_local(dsq_id);
6616 }
6617
6618 /**
6619  * scx_bpf_dsq_move_set_slice - Override slice when moving between DSQs
6620  * @it__iter: DSQ iterator in progress
6621  * @slice: duration the moved task can run for in nsecs
6622  *
6623  * Override the slice of the next task that will be moved from @it__iter using
6624  * scx_bpf_dsq_move[_vtime](). If this function is not called, the previous
6625  * slice duration is kept.
6626  */
6627 __bpf_kfunc void scx_bpf_dsq_move_set_slice(struct bpf_iter_scx_dsq *it__iter,
6628                                             u64 slice)
6629 {
6630         struct bpf_iter_scx_dsq_kern *kit = (void *)it__iter;
6631
6632         kit->slice = slice;
6633         kit->cursor.flags |= __SCX_DSQ_ITER_HAS_SLICE;
6634 }
6635
6636 /* for backward compatibility, will be removed in v6.15 */
6637 __bpf_kfunc void scx_bpf_dispatch_from_dsq_set_slice(
6638                         struct bpf_iter_scx_dsq *it__iter, u64 slice)
6639 {
6640         printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq_set_slice() renamed to scx_bpf_dsq_move_set_slice()");
6641         scx_bpf_dsq_move_set_slice(it__iter, slice);
6642 }
6643
6644 /**
6645  * scx_bpf_dsq_move_set_vtime - Override vtime when moving between DSQs
6646  * @it__iter: DSQ iterator in progress
6647  * @vtime: task's ordering inside the vtime-sorted queue of the target DSQ
6648  *
6649  * Override the vtime of the next task that will be moved from @it__iter using
6650  * scx_bpf_dsq_move_vtime(). If this function is not called, the previous slice
6651  * vtime is kept. If scx_bpf_dsq_move() is used to dispatch the next task, the
6652  * override is ignored and cleared.
6653  */
6654 __bpf_kfunc void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter,
6655                                             u64 vtime)
6656 {
6657         struct bpf_iter_scx_dsq_kern *kit = (void *)it__iter;
6658
6659         kit->vtime = vtime;
6660         kit->cursor.flags |= __SCX_DSQ_ITER_HAS_VTIME;
6661 }
6662
6663 /* for backward compatibility, will be removed in v6.15 */
6664 __bpf_kfunc void scx_bpf_dispatch_from_dsq_set_vtime(
6665                         struct bpf_iter_scx_dsq *it__iter, u64 vtime)
6666 {
6667         printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq_set_vtime() renamed to scx_bpf_dsq_move_set_vtime()");
6668         scx_bpf_dsq_move_set_vtime(it__iter, vtime);
6669 }
6670
6671 /**
6672  * scx_bpf_dsq_move - Move a task from DSQ iteration to a DSQ
6673  * @it__iter: DSQ iterator in progress
6674  * @p: task to transfer
6675  * @dsq_id: DSQ to move @p to
6676  * @enq_flags: SCX_ENQ_*
6677  *
6678  * Transfer @p which is on the DSQ currently iterated by @it__iter to the DSQ
6679  * specified by @dsq_id. All DSQs - local DSQs, global DSQ and user DSQs - can
6680  * be the destination.
6681  *
6682  * For the transfer to be successful, @p must still be on the DSQ and have been
6683  * queued before the DSQ iteration started. This function doesn't care whether
6684  * @p was obtained from the DSQ iteration. @p just has to be on the DSQ and have
6685  * been queued before the iteration started.
6686  *
6687  * @p's slice is kept by default. Use scx_bpf_dsq_move_set_slice() to update.
6688  *
6689  * Can be called from ops.dispatch() or any BPF context which doesn't hold a rq
6690  * lock (e.g. BPF timers or SYSCALL programs).
6691  *
6692  * Returns %true if @p has been consumed, %false if @p had already been consumed
6693  * or dequeued.
6694  */
6695 __bpf_kfunc bool scx_bpf_dsq_move(struct bpf_iter_scx_dsq *it__iter,
6696                                   struct task_struct *p, u64 dsq_id,
6697                                   u64 enq_flags)
6698 {
6699         return scx_dsq_move((struct bpf_iter_scx_dsq_kern *)it__iter,
6700                             p, dsq_id, enq_flags);
6701 }
6702
6703 /* for backward compatibility, will be removed in v6.15 */
6704 __bpf_kfunc bool scx_bpf_dispatch_from_dsq(struct bpf_iter_scx_dsq *it__iter,
6705                                            struct task_struct *p, u64 dsq_id,
6706                                            u64 enq_flags)
6707 {
6708         printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq() renamed to scx_bpf_dsq_move()");
6709         return scx_bpf_dsq_move(it__iter, p, dsq_id, enq_flags);
6710 }
6711
6712 /**
6713  * scx_bpf_dsq_move_vtime - Move a task from DSQ iteration to a PRIQ DSQ
6714  * @it__iter: DSQ iterator in progress
6715  * @p: task to transfer
6716  * @dsq_id: DSQ to move @p to
6717  * @enq_flags: SCX_ENQ_*
6718  *
6719  * Transfer @p which is on the DSQ currently iterated by @it__iter to the
6720  * priority queue of the DSQ specified by @dsq_id. The destination must be a
6721  * user DSQ as only user DSQs support priority queue.
6722  *
6723  * @p's slice and vtime are kept by default. Use scx_bpf_dsq_move_set_slice()
6724  * and scx_bpf_dsq_move_set_vtime() to update.
6725  *
6726  * All other aspects are identical to scx_bpf_dsq_move(). See
6727  * scx_bpf_dsq_insert_vtime() for more information on @vtime.
6728  */
6729 __bpf_kfunc bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter,
6730                                         struct task_struct *p, u64 dsq_id,
6731                                         u64 enq_flags)
6732 {
6733         return scx_dsq_move((struct bpf_iter_scx_dsq_kern *)it__iter,
6734                             p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
6735 }
6736
6737 /* for backward compatibility, will be removed in v6.15 */
6738 __bpf_kfunc bool scx_bpf_dispatch_vtime_from_dsq(struct bpf_iter_scx_dsq *it__iter,
6739                                                  struct task_struct *p, u64 dsq_id,
6740                                                  u64 enq_flags)
6741 {
6742         printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq_vtime() renamed to scx_bpf_dsq_move_vtime()");
6743         return scx_bpf_dsq_move_vtime(it__iter, p, dsq_id, enq_flags);
6744 }
6745
6746 __bpf_kfunc_end_defs();
6747
6748 BTF_KFUNCS_START(scx_kfunc_ids_dispatch)
6749 BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots)
6750 BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel)
6751 BTF_ID_FLAGS(func, scx_bpf_dsq_move_to_local)
6752 BTF_ID_FLAGS(func, scx_bpf_consume)
6753 BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice)
6754 BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime)
6755 BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU)
6756 BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU)
6757 BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_slice)
6758 BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_vtime)
6759 BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq, KF_RCU)
6760 BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime_from_dsq, KF_RCU)
6761 BTF_KFUNCS_END(scx_kfunc_ids_dispatch)
6762
6763 static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = {
6764         .owner                  = THIS_MODULE,
6765         .set                    = &scx_kfunc_ids_dispatch,
6766 };
6767
6768 __bpf_kfunc_start_defs();
6769
6770 /**
6771  * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ
6772  *
6773  * Iterate over all of the tasks currently enqueued on the local DSQ of the
6774  * caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of
6775  * processed tasks. Can only be called from ops.cpu_release().
6776  */
6777 __bpf_kfunc u32 scx_bpf_reenqueue_local(void)
6778 {
6779         LIST_HEAD(tasks);
6780         u32 nr_enqueued = 0;
6781         struct rq *rq;
6782         struct task_struct *p, *n;
6783
6784         if (!scx_kf_allowed(SCX_KF_CPU_RELEASE))
6785                 return 0;
6786
6787         rq = cpu_rq(smp_processor_id());
6788         lockdep_assert_rq_held(rq);
6789
6790         /*
6791          * The BPF scheduler may choose to dispatch tasks back to
6792          * @rq->scx.local_dsq. Move all candidate tasks off to a private list
6793          * first to avoid processing the same tasks repeatedly.
6794          */
6795         list_for_each_entry_safe(p, n, &rq->scx.local_dsq.list,
6796                                  scx.dsq_list.node) {
6797                 /*
6798                  * If @p is being migrated, @p's current CPU may not agree with
6799                  * its allowed CPUs and the migration_cpu_stop is about to
6800                  * deactivate and re-activate @p anyway. Skip re-enqueueing.
6801                  *
6802                  * While racing sched property changes may also dequeue and
6803                  * re-enqueue a migrating task while its current CPU and allowed
6804                  * CPUs disagree, they use %ENQUEUE_RESTORE which is bypassed to
6805                  * the current local DSQ for running tasks and thus are not
6806                  * visible to the BPF scheduler.
6807                  */
6808                 if (p->migration_pending)
6809                         continue;
6810
6811                 dispatch_dequeue(rq, p);
6812                 list_add_tail(&p->scx.dsq_list.node, &tasks);
6813         }
6814
6815         list_for_each_entry_safe(p, n, &tasks, scx.dsq_list.node) {
6816                 list_del_init(&p->scx.dsq_list.node);
6817                 do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1);
6818                 nr_enqueued++;
6819         }
6820
6821         return nr_enqueued;
6822 }
6823
6824 __bpf_kfunc_end_defs();
6825
6826 BTF_KFUNCS_START(scx_kfunc_ids_cpu_release)
6827 BTF_ID_FLAGS(func, scx_bpf_reenqueue_local)
6828 BTF_KFUNCS_END(scx_kfunc_ids_cpu_release)
6829
6830 static const struct btf_kfunc_id_set scx_kfunc_set_cpu_release = {
6831         .owner                  = THIS_MODULE,
6832         .set                    = &scx_kfunc_ids_cpu_release,
6833 };
6834
6835 __bpf_kfunc_start_defs();
6836
6837 /**
6838  * scx_bpf_create_dsq - Create a custom DSQ
6839  * @dsq_id: DSQ to create
6840  * @node: NUMA node to allocate from
6841  *
6842  * Create a custom DSQ identified by @dsq_id. Can be called from any sleepable
6843  * scx callback, and any BPF_PROG_TYPE_SYSCALL prog.
6844  */
6845 __bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node)
6846 {
6847         if (unlikely(node >= (int)nr_node_ids ||
6848                      (node < 0 && node != NUMA_NO_NODE)))
6849                 return -EINVAL;
6850         return PTR_ERR_OR_ZERO(create_dsq(dsq_id, node));
6851 }
6852
6853 __bpf_kfunc_end_defs();
6854
6855 BTF_KFUNCS_START(scx_kfunc_ids_unlocked)
6856 BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_SLEEPABLE)
6857 BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice)
6858 BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime)
6859 BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU)
6860 BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU)
6861 BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_slice)
6862 BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_vtime)
6863 BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq, KF_RCU)
6864 BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime_from_dsq, KF_RCU)
6865 BTF_KFUNCS_END(scx_kfunc_ids_unlocked)
6866
6867 static const struct btf_kfunc_id_set scx_kfunc_set_unlocked = {
6868         .owner                  = THIS_MODULE,
6869         .set                    = &scx_kfunc_ids_unlocked,
6870 };
6871
6872 __bpf_kfunc_start_defs();
6873
6874 /**
6875  * scx_bpf_kick_cpu - Trigger reschedule on a CPU
6876  * @cpu: cpu to kick
6877  * @flags: %SCX_KICK_* flags
6878  *
6879  * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or
6880  * trigger rescheduling on a busy CPU. This can be called from any online
6881  * scx_ops operation and the actual kicking is performed asynchronously through
6882  * an irq work.
6883  */
6884 __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags)
6885 {
6886         struct rq *this_rq;
6887         unsigned long irq_flags;
6888
6889         if (!ops_cpu_valid(cpu, NULL))
6890                 return;
6891
6892         local_irq_save(irq_flags);
6893
6894         this_rq = this_rq();
6895
6896         /*
6897          * While bypassing for PM ops, IRQ handling may not be online which can
6898          * lead to irq_work_queue() malfunction such as infinite busy wait for
6899          * IRQ status update. Suppress kicking.
6900          */
6901         if (scx_rq_bypassing(this_rq))
6902                 goto out;
6903
6904         /*
6905          * Actual kicking is bounced to kick_cpus_irq_workfn() to avoid nesting
6906          * rq locks. We can probably be smarter and avoid bouncing if called
6907          * from ops which don't hold a rq lock.
6908          */
6909         if (flags & SCX_KICK_IDLE) {
6910                 struct rq *target_rq = cpu_rq(cpu);
6911
6912                 if (unlikely(flags & (SCX_KICK_PREEMPT | SCX_KICK_WAIT)))
6913                         scx_ops_error("PREEMPT/WAIT cannot be used with SCX_KICK_IDLE");
6914
6915                 if (raw_spin_rq_trylock(target_rq)) {
6916                         if (can_skip_idle_kick(target_rq)) {
6917                                 raw_spin_rq_unlock(target_rq);
6918                                 goto out;
6919                         }
6920                         raw_spin_rq_unlock(target_rq);
6921                 }
6922                 cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick_if_idle);
6923         } else {
6924                 cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick);
6925
6926                 if (flags & SCX_KICK_PREEMPT)
6927                         cpumask_set_cpu(cpu, this_rq->scx.cpus_to_preempt);
6928                 if (flags & SCX_KICK_WAIT)
6929                         cpumask_set_cpu(cpu, this_rq->scx.cpus_to_wait);
6930         }
6931
6932         irq_work_queue(&this_rq->scx.kick_cpus_irq_work);
6933 out:
6934         local_irq_restore(irq_flags);
6935 }
6936
6937 /**
6938  * scx_bpf_dsq_nr_queued - Return the number of queued tasks
6939  * @dsq_id: id of the DSQ
6940  *
6941  * Return the number of tasks in the DSQ matching @dsq_id. If not found,
6942  * -%ENOENT is returned.
6943  */
6944 __bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id)
6945 {
6946         struct scx_dispatch_q *dsq;
6947         s32 ret;
6948
6949         preempt_disable();
6950
6951         if (dsq_id == SCX_DSQ_LOCAL) {
6952                 ret = READ_ONCE(this_rq()->scx.local_dsq.nr);
6953                 goto out;
6954         } else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) {
6955                 s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK;
6956
6957                 if (ops_cpu_valid(cpu, NULL)) {
6958                         ret = READ_ONCE(cpu_rq(cpu)->scx.local_dsq.nr);
6959                         goto out;
6960                 }
6961         } else {
6962                 dsq = find_user_dsq(dsq_id);
6963                 if (dsq) {
6964                         ret = READ_ONCE(dsq->nr);
6965                         goto out;
6966                 }
6967         }
6968         ret = -ENOENT;
6969 out:
6970         preempt_enable();
6971         return ret;
6972 }
6973
6974 /**
6975  * scx_bpf_destroy_dsq - Destroy a custom DSQ
6976  * @dsq_id: DSQ to destroy
6977  *
6978  * Destroy the custom DSQ identified by @dsq_id. Only DSQs created with
6979  * scx_bpf_create_dsq() can be destroyed. The caller must ensure that the DSQ is
6980  * empty and no further tasks are dispatched to it. Ignored if called on a DSQ
6981  * which doesn't exist. Can be called from any online scx_ops operations.
6982  */
6983 __bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id)
6984 {
6985         destroy_dsq(dsq_id);
6986 }
6987
6988 /**
6989  * bpf_iter_scx_dsq_new - Create a DSQ iterator
6990  * @it: iterator to initialize
6991  * @dsq_id: DSQ to iterate
6992  * @flags: %SCX_DSQ_ITER_*
6993  *
6994  * Initialize BPF iterator @it which can be used with bpf_for_each() to walk
6995  * tasks in the DSQ specified by @dsq_id. Iteration using @it only includes
6996  * tasks which are already queued when this function is invoked.
6997  */
6998 __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id,
6999                                      u64 flags)
7000 {
7001         struct bpf_iter_scx_dsq_kern *kit = (void *)it;
7002
7003         BUILD_BUG_ON(sizeof(struct bpf_iter_scx_dsq_kern) >
7004                      sizeof(struct bpf_iter_scx_dsq));
7005         BUILD_BUG_ON(__alignof__(struct bpf_iter_scx_dsq_kern) !=
7006                      __alignof__(struct bpf_iter_scx_dsq));
7007
7008         if (flags & ~__SCX_DSQ_ITER_USER_FLAGS)
7009                 return -EINVAL;
7010
7011         kit->dsq = find_user_dsq(dsq_id);
7012         if (!kit->dsq)
7013                 return -ENOENT;
7014
7015         INIT_LIST_HEAD(&kit->cursor.node);
7016         kit->cursor.flags |= SCX_DSQ_LNODE_ITER_CURSOR | flags;
7017         kit->cursor.priv = READ_ONCE(kit->dsq->seq);
7018
7019         return 0;
7020 }
7021
7022 /**
7023  * bpf_iter_scx_dsq_next - Progress a DSQ iterator
7024  * @it: iterator to progress
7025  *
7026  * Return the next task. See bpf_iter_scx_dsq_new().
7027  */
7028 __bpf_kfunc struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it)
7029 {
7030         struct bpf_iter_scx_dsq_kern *kit = (void *)it;
7031         bool rev = kit->cursor.flags & SCX_DSQ_ITER_REV;
7032         struct task_struct *p;
7033         unsigned long flags;
7034
7035         if (!kit->dsq)
7036                 return NULL;
7037
7038         raw_spin_lock_irqsave(&kit->dsq->lock, flags);
7039
7040         if (list_empty(&kit->cursor.node))
7041                 p = NULL;
7042         else
7043                 p = container_of(&kit->cursor, struct task_struct, scx.dsq_list);
7044
7045         /*
7046          * Only tasks which were queued before the iteration started are
7047          * visible. This bounds BPF iterations and guarantees that vtime never
7048          * jumps in the other direction while iterating.
7049          */
7050         do {
7051                 p = nldsq_next_task(kit->dsq, p, rev);
7052         } while (p && unlikely(u32_before(kit->cursor.priv, p->scx.dsq_seq)));
7053
7054         if (p) {
7055                 if (rev)
7056                         list_move_tail(&kit->cursor.node, &p->scx.dsq_list.node);
7057                 else
7058                         list_move(&kit->cursor.node, &p->scx.dsq_list.node);
7059         } else {
7060                 list_del_init(&kit->cursor.node);
7061         }
7062
7063         raw_spin_unlock_irqrestore(&kit->dsq->lock, flags);
7064
7065         return p;
7066 }
7067
7068 /**
7069  * bpf_iter_scx_dsq_destroy - Destroy a DSQ iterator
7070  * @it: iterator to destroy
7071  *
7072  * Undo scx_iter_scx_dsq_new().
7073  */
7074 __bpf_kfunc void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it)
7075 {
7076         struct bpf_iter_scx_dsq_kern *kit = (void *)it;
7077
7078         if (!kit->dsq)
7079                 return;
7080
7081         if (!list_empty(&kit->cursor.node)) {
7082                 unsigned long flags;
7083
7084                 raw_spin_lock_irqsave(&kit->dsq->lock, flags);
7085                 list_del_init(&kit->cursor.node);
7086                 raw_spin_unlock_irqrestore(&kit->dsq->lock, flags);
7087         }
7088         kit->dsq = NULL;
7089 }
7090
7091 __bpf_kfunc_end_defs();
7092
7093 static s32 __bstr_format(u64 *data_buf, char *line_buf, size_t line_size,
7094                          char *fmt, unsigned long long *data, u32 data__sz)
7095 {
7096         struct bpf_bprintf_data bprintf_data = { .get_bin_args = true };
7097         s32 ret;
7098
7099         if (data__sz % 8 || data__sz > MAX_BPRINTF_VARARGS * 8 ||
7100             (data__sz && !data)) {
7101                 scx_ops_error("invalid data=%p and data__sz=%u",
7102                               (void *)data, data__sz);
7103                 return -EINVAL;
7104         }
7105
7106         ret = copy_from_kernel_nofault(data_buf, data, data__sz);
7107         if (ret < 0) {
7108                 scx_ops_error("failed to read data fields (%d)", ret);
7109                 return ret;
7110         }
7111
7112         ret = bpf_bprintf_prepare(fmt, UINT_MAX, data_buf, data__sz / 8,
7113                                   &bprintf_data);
7114         if (ret < 0) {
7115                 scx_ops_error("format preparation failed (%d)", ret);
7116                 return ret;
7117         }
7118
7119         ret = bstr_printf(line_buf, line_size, fmt,
7120                           bprintf_data.bin_args);
7121         bpf_bprintf_cleanup(&bprintf_data);
7122         if (ret < 0) {
7123                 scx_ops_error("(\"%s\", %p, %u) failed to format",
7124                               fmt, data, data__sz);
7125                 return ret;
7126         }
7127
7128         return ret;
7129 }
7130
7131 static s32 bstr_format(struct scx_bstr_buf *buf,
7132                        char *fmt, unsigned long long *data, u32 data__sz)
7133 {
7134         return __bstr_format(buf->data, buf->line, sizeof(buf->line),
7135                              fmt, data, data__sz);
7136 }
7137
7138 __bpf_kfunc_start_defs();
7139
7140 /**
7141  * scx_bpf_exit_bstr - Gracefully exit the BPF scheduler.
7142  * @exit_code: Exit value to pass to user space via struct scx_exit_info.
7143  * @fmt: error message format string
7144  * @data: format string parameters packaged using ___bpf_fill() macro
7145  * @data__sz: @data len, must end in '__sz' for the verifier
7146  *
7147  * Indicate that the BPF scheduler wants to exit gracefully, and initiate ops
7148  * disabling.
7149  */
7150 __bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt,
7151                                    unsigned long long *data, u32 data__sz)
7152 {
7153         unsigned long flags;
7154
7155         raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags);
7156         if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0)
7157                 scx_ops_exit_kind(SCX_EXIT_UNREG_BPF, exit_code, "%s",
7158                                   scx_exit_bstr_buf.line);
7159         raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags);
7160 }
7161
7162 /**
7163  * scx_bpf_error_bstr - Indicate fatal error
7164  * @fmt: error message format string
7165  * @data: format string parameters packaged using ___bpf_fill() macro
7166  * @data__sz: @data len, must end in '__sz' for the verifier
7167  *
7168  * Indicate that the BPF scheduler encountered a fatal error and initiate ops
7169  * disabling.
7170  */
7171 __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data,
7172                                     u32 data__sz)
7173 {
7174         unsigned long flags;
7175
7176         raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags);
7177         if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0)
7178                 scx_ops_exit_kind(SCX_EXIT_ERROR_BPF, 0, "%s",
7179                                   scx_exit_bstr_buf.line);
7180         raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags);
7181 }
7182
7183 /**
7184  * scx_bpf_dump - Generate extra debug dump specific to the BPF scheduler
7185  * @fmt: format string
7186  * @data: format string parameters packaged using ___bpf_fill() macro
7187  * @data__sz: @data len, must end in '__sz' for the verifier
7188  *
7189  * To be called through scx_bpf_dump() helper from ops.dump(), dump_cpu() and
7190  * dump_task() to generate extra debug dump specific to the BPF scheduler.
7191  *
7192  * The extra dump may be multiple lines. A single line may be split over
7193  * multiple calls. The last line is automatically terminated.
7194  */
7195 __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data,
7196                                    u32 data__sz)
7197 {
7198         struct scx_dump_data *dd = &scx_dump_data;
7199         struct scx_bstr_buf *buf = &dd->buf;
7200         s32 ret;
7201
7202         if (raw_smp_processor_id() != dd->cpu) {
7203                 scx_ops_error("scx_bpf_dump() must only be called from ops.dump() and friends");
7204                 return;
7205         }
7206
7207         /* append the formatted string to the line buf */
7208         ret = __bstr_format(buf->data, buf->line + dd->cursor,
7209                             sizeof(buf->line) - dd->cursor, fmt, data, data__sz);
7210         if (ret < 0) {
7211                 dump_line(dd->s, "%s[!] (\"%s\", %p, %u) failed to format (%d)",
7212                           dd->prefix, fmt, data, data__sz, ret);
7213                 return;
7214         }
7215
7216         dd->cursor += ret;
7217         dd->cursor = min_t(s32, dd->cursor, sizeof(buf->line));
7218
7219         if (!dd->cursor)
7220                 return;
7221
7222         /*
7223          * If the line buf overflowed or ends in a newline, flush it into the
7224          * dump. This is to allow the caller to generate a single line over
7225          * multiple calls. As ops_dump_flush() can also handle multiple lines in
7226          * the line buf, the only case which can lead to an unexpected
7227          * truncation is when the caller keeps generating newlines in the middle
7228          * instead of the end consecutively. Don't do that.
7229          */
7230         if (dd->cursor >= sizeof(buf->line) || buf->line[dd->cursor - 1] == '\n')
7231                 ops_dump_flush();
7232 }
7233
7234 /**
7235  * scx_bpf_cpuperf_cap - Query the maximum relative capacity of a CPU
7236  * @cpu: CPU of interest
7237  *
7238  * Return the maximum relative capacity of @cpu in relation to the most
7239  * performant CPU in the system. The return value is in the range [1,
7240  * %SCX_CPUPERF_ONE]. See scx_bpf_cpuperf_cur().
7241  */
7242 __bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu)
7243 {
7244         if (ops_cpu_valid(cpu, NULL))
7245                 return arch_scale_cpu_capacity(cpu);
7246         else
7247                 return SCX_CPUPERF_ONE;
7248 }
7249
7250 /**
7251  * scx_bpf_cpuperf_cur - Query the current relative performance of a CPU
7252  * @cpu: CPU of interest
7253  *
7254  * Return the current relative performance of @cpu in relation to its maximum.
7255  * The return value is in the range [1, %SCX_CPUPERF_ONE].
7256  *
7257  * The current performance level of a CPU in relation to the maximum performance
7258  * available in the system can be calculated as follows:
7259  *
7260  *   scx_bpf_cpuperf_cap() * scx_bpf_cpuperf_cur() / %SCX_CPUPERF_ONE
7261  *
7262  * The result is in the range [1, %SCX_CPUPERF_ONE].
7263  */
7264 __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu)
7265 {
7266         if (ops_cpu_valid(cpu, NULL))
7267                 return arch_scale_freq_capacity(cpu);
7268         else
7269                 return SCX_CPUPERF_ONE;
7270 }
7271
7272 /**
7273  * scx_bpf_cpuperf_set - Set the relative performance target of a CPU
7274  * @cpu: CPU of interest
7275  * @perf: target performance level [0, %SCX_CPUPERF_ONE]
7276  * @flags: %SCX_CPUPERF_* flags
7277  *
7278  * Set the target performance level of @cpu to @perf. @perf is in linear
7279  * relative scale between 0 and %SCX_CPUPERF_ONE. This determines how the
7280  * schedutil cpufreq governor chooses the target frequency.
7281  *
7282  * The actual performance level chosen, CPU grouping, and the overhead and
7283  * latency of the operations are dependent on the hardware and cpufreq driver in
7284  * use. Consult hardware and cpufreq documentation for more information. The
7285  * current performance level can be monitored using scx_bpf_cpuperf_cur().
7286  */
7287 __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf)
7288 {
7289         if (unlikely(perf > SCX_CPUPERF_ONE)) {
7290                 scx_ops_error("Invalid cpuperf target %u for CPU %d", perf, cpu);
7291                 return;
7292         }
7293
7294         if (ops_cpu_valid(cpu, NULL)) {
7295                 struct rq *rq = cpu_rq(cpu);
7296
7297                 rq->scx.cpuperf_target = perf;
7298
7299                 rcu_read_lock_sched_notrace();
7300                 cpufreq_update_util(cpu_rq(cpu), 0);
7301                 rcu_read_unlock_sched_notrace();
7302         }
7303 }
7304
7305 /**
7306  * scx_bpf_nr_cpu_ids - Return the number of possible CPU IDs
7307  *
7308  * All valid CPU IDs in the system are smaller than the returned value.
7309  */
7310 __bpf_kfunc u32 scx_bpf_nr_cpu_ids(void)
7311 {
7312         return nr_cpu_ids;
7313 }
7314
7315 /**
7316  * scx_bpf_get_possible_cpumask - Get a referenced kptr to cpu_possible_mask
7317  */
7318 __bpf_kfunc const struct cpumask *scx_bpf_get_possible_cpumask(void)
7319 {
7320         return cpu_possible_mask;
7321 }
7322
7323 /**
7324  * scx_bpf_get_online_cpumask - Get a referenced kptr to cpu_online_mask
7325  */
7326 __bpf_kfunc const struct cpumask *scx_bpf_get_online_cpumask(void)
7327 {
7328         return cpu_online_mask;
7329 }
7330
7331 /**
7332  * scx_bpf_put_cpumask - Release a possible/online cpumask
7333  * @cpumask: cpumask to release
7334  */
7335 __bpf_kfunc void scx_bpf_put_cpumask(const struct cpumask *cpumask)
7336 {
7337         /*
7338          * Empty function body because we aren't actually acquiring or releasing
7339          * a reference to a global cpumask, which is read-only in the caller and
7340          * is never released. The acquire / release semantics here are just used
7341          * to make the cpumask is a trusted pointer in the caller.
7342          */
7343 }
7344
7345 /**
7346  * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking
7347  * per-CPU cpumask.
7348  *
7349  * Returns NULL if idle tracking is not enabled, or running on a UP kernel.
7350  */
7351 __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void)
7352 {
7353         if (!static_branch_likely(&scx_builtin_idle_enabled)) {
7354                 scx_ops_error("built-in idle tracking is disabled");
7355                 return cpu_none_mask;
7356         }
7357
7358 #ifdef CONFIG_SMP
7359         return idle_masks.cpu;
7360 #else
7361         return cpu_none_mask;
7362 #endif
7363 }
7364
7365 /**
7366  * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking,
7367  * per-physical-core cpumask. Can be used to determine if an entire physical
7368  * core is free.
7369  *
7370  * Returns NULL if idle tracking is not enabled, or running on a UP kernel.
7371  */
7372 __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void)
7373 {
7374         if (!static_branch_likely(&scx_builtin_idle_enabled)) {
7375                 scx_ops_error("built-in idle tracking is disabled");
7376                 return cpu_none_mask;
7377         }
7378
7379 #ifdef CONFIG_SMP
7380         if (sched_smt_active())
7381                 return idle_masks.smt;
7382         else
7383                 return idle_masks.cpu;
7384 #else
7385         return cpu_none_mask;
7386 #endif
7387 }
7388
7389 /**
7390  * scx_bpf_put_idle_cpumask - Release a previously acquired referenced kptr to
7391  * either the percpu, or SMT idle-tracking cpumask.
7392  */
7393 __bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask)
7394 {
7395         /*
7396          * Empty function body because we aren't actually acquiring or releasing
7397          * a reference to a global idle cpumask, which is read-only in the
7398          * caller and is never released. The acquire / release semantics here
7399          * are just used to make the cpumask a trusted pointer in the caller.
7400          */
7401 }
7402
7403 /**
7404  * scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state
7405  * @cpu: cpu to test and clear idle for
7406  *
7407  * Returns %true if @cpu was idle and its idle state was successfully cleared.
7408  * %false otherwise.
7409  *
7410  * Unavailable if ops.update_idle() is implemented and
7411  * %SCX_OPS_KEEP_BUILTIN_IDLE is not set.
7412  */
7413 __bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu)
7414 {
7415         if (!static_branch_likely(&scx_builtin_idle_enabled)) {
7416                 scx_ops_error("built-in idle tracking is disabled");
7417                 return false;
7418         }
7419
7420         if (ops_cpu_valid(cpu, NULL))
7421                 return test_and_clear_cpu_idle(cpu);
7422         else
7423                 return false;
7424 }
7425
7426 /**
7427  * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu
7428  * @cpus_allowed: Allowed cpumask
7429  * @flags: %SCX_PICK_IDLE_CPU_* flags
7430  *
7431  * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu
7432  * number on success. -%EBUSY if no matching cpu was found.
7433  *
7434  * Idle CPU tracking may race against CPU scheduling state transitions. For
7435  * example, this function may return -%EBUSY as CPUs are transitioning into the
7436  * idle state. If the caller then assumes that there will be dispatch events on
7437  * the CPUs as they were all busy, the scheduler may end up stalling with CPUs
7438  * idling while there are pending tasks. Use scx_bpf_pick_any_cpu() and
7439  * scx_bpf_kick_cpu() to guarantee that there will be at least one dispatch
7440  * event in the near future.
7441  *
7442  * Unavailable if ops.update_idle() is implemented and
7443  * %SCX_OPS_KEEP_BUILTIN_IDLE is not set.
7444  */
7445 __bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed,
7446                                       u64 flags)
7447 {
7448         if (!static_branch_likely(&scx_builtin_idle_enabled)) {
7449                 scx_ops_error("built-in idle tracking is disabled");
7450                 return -EBUSY;
7451         }
7452
7453         return scx_pick_idle_cpu(cpus_allowed, flags);
7454 }
7455
7456 /**
7457  * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU
7458  * @cpus_allowed: Allowed cpumask
7459  * @flags: %SCX_PICK_IDLE_CPU_* flags
7460  *
7461  * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any
7462  * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu
7463  * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is
7464  * empty.
7465  *
7466  * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not
7467  * set, this function can't tell which CPUs are idle and will always pick any
7468  * CPU.
7469  */
7470 __bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed,
7471                                      u64 flags)
7472 {
7473         s32 cpu;
7474
7475         if (static_branch_likely(&scx_builtin_idle_enabled)) {
7476                 cpu = scx_pick_idle_cpu(cpus_allowed, flags);
7477                 if (cpu >= 0)
7478                         return cpu;
7479         }
7480
7481         cpu = cpumask_any_distribute(cpus_allowed);
7482         if (cpu < nr_cpu_ids)
7483                 return cpu;
7484         else
7485                 return -EBUSY;
7486 }
7487
7488 /**
7489  * scx_bpf_task_running - Is task currently running?
7490  * @p: task of interest
7491  */
7492 __bpf_kfunc bool scx_bpf_task_running(const struct task_struct *p)
7493 {
7494         return task_rq(p)->curr == p;
7495 }
7496
7497 /**
7498  * scx_bpf_task_cpu - CPU a task is currently associated with
7499  * @p: task of interest
7500  */
7501 __bpf_kfunc s32 scx_bpf_task_cpu(const struct task_struct *p)
7502 {
7503         return task_cpu(p);
7504 }
7505
7506 /**
7507  * scx_bpf_cpu_rq - Fetch the rq of a CPU
7508  * @cpu: CPU of the rq
7509  */
7510 __bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu)
7511 {
7512         if (!ops_cpu_valid(cpu, NULL))
7513                 return NULL;
7514
7515         return cpu_rq(cpu);
7516 }
7517
7518 /**
7519  * scx_bpf_task_cgroup - Return the sched cgroup of a task
7520  * @p: task of interest
7521  *
7522  * @p->sched_task_group->css.cgroup represents the cgroup @p is associated with
7523  * from the scheduler's POV. SCX operations should use this function to
7524  * determine @p's current cgroup as, unlike following @p->cgroups,
7525  * @p->sched_task_group is protected by @p's rq lock and thus atomic w.r.t. all
7526  * rq-locked operations. Can be called on the parameter tasks of rq-locked
7527  * operations. The restriction guarantees that @p's rq is locked by the caller.
7528  */
7529 #ifdef CONFIG_CGROUP_SCHED
7530 __bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p)
7531 {
7532         struct task_group *tg = p->sched_task_group;
7533         struct cgroup *cgrp = &cgrp_dfl_root.cgrp;
7534
7535         if (!scx_kf_allowed_on_arg_tasks(__SCX_KF_RQ_LOCKED, p))
7536                 goto out;
7537
7538         cgrp = tg_cgrp(tg);
7539
7540 out:
7541         cgroup_get(cgrp);
7542         return cgrp;
7543 }
7544 #endif
7545
7546 __bpf_kfunc_end_defs();
7547
7548 BTF_KFUNCS_START(scx_kfunc_ids_any)
7549 BTF_ID_FLAGS(func, scx_bpf_kick_cpu)
7550 BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued)
7551 BTF_ID_FLAGS(func, scx_bpf_destroy_dsq)
7552 BTF_ID_FLAGS(func, bpf_iter_scx_dsq_new, KF_ITER_NEW | KF_RCU_PROTECTED)
7553 BTF_ID_FLAGS(func, bpf_iter_scx_dsq_next, KF_ITER_NEXT | KF_RET_NULL)
7554 BTF_ID_FLAGS(func, bpf_iter_scx_dsq_destroy, KF_ITER_DESTROY)
7555 BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_TRUSTED_ARGS)
7556 BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS)
7557 BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_TRUSTED_ARGS)
7558 BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap)
7559 BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur)
7560 BTF_ID_FLAGS(func, scx_bpf_cpuperf_set)
7561 BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids)
7562 BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE)
7563 BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE)
7564 BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE)
7565 BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_ACQUIRE)
7566 BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_ACQUIRE)
7567 BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE)
7568 BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle)
7569 BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU)
7570 BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU)
7571 BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU)
7572 BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU)
7573 BTF_ID_FLAGS(func, scx_bpf_cpu_rq)
7574 #ifdef CONFIG_CGROUP_SCHED
7575 BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE)
7576 #endif
7577 BTF_KFUNCS_END(scx_kfunc_ids_any)
7578
7579 static const struct btf_kfunc_id_set scx_kfunc_set_any = {
7580         .owner                  = THIS_MODULE,
7581         .set                    = &scx_kfunc_ids_any,
7582 };
7583
7584 static int __init scx_init(void)
7585 {
7586         int ret;
7587
7588         /*
7589          * kfunc registration can't be done from init_sched_ext_class() as
7590          * register_btf_kfunc_id_set() needs most of the system to be up.
7591          *
7592          * Some kfuncs are context-sensitive and can only be called from
7593          * specific SCX ops. They are grouped into BTF sets accordingly.
7594          * Unfortunately, BPF currently doesn't have a way of enforcing such
7595          * restrictions. Eventually, the verifier should be able to enforce
7596          * them. For now, register them the same and make each kfunc explicitly
7597          * check using scx_kf_allowed().
7598          */
7599         if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
7600                                              &scx_kfunc_set_select_cpu)) ||
7601             (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
7602                                              &scx_kfunc_set_enqueue_dispatch)) ||
7603             (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
7604                                              &scx_kfunc_set_dispatch)) ||
7605             (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
7606                                              &scx_kfunc_set_cpu_release)) ||
7607             (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
7608                                              &scx_kfunc_set_unlocked)) ||
7609             (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL,
7610                                              &scx_kfunc_set_unlocked)) ||
7611             (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
7612                                              &scx_kfunc_set_any)) ||
7613             (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
7614                                              &scx_kfunc_set_any)) ||
7615             (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL,
7616                                              &scx_kfunc_set_any))) {
7617                 pr_err("sched_ext: Failed to register kfunc sets (%d)\n", ret);
7618                 return ret;
7619         }
7620
7621         ret = register_bpf_struct_ops(&bpf_sched_ext_ops, sched_ext_ops);
7622         if (ret) {
7623                 pr_err("sched_ext: Failed to register struct_ops (%d)\n", ret);
7624                 return ret;
7625         }
7626
7627         ret = register_pm_notifier(&scx_pm_notifier);
7628         if (ret) {
7629                 pr_err("sched_ext: Failed to register PM notifier (%d)\n", ret);
7630                 return ret;
7631         }
7632
7633         scx_kset = kset_create_and_add("sched_ext", &scx_uevent_ops, kernel_kobj);
7634         if (!scx_kset) {
7635                 pr_err("sched_ext: Failed to create /sys/kernel/sched_ext\n");
7636                 return -ENOMEM;
7637         }
7638
7639         ret = sysfs_create_group(&scx_kset->kobj, &scx_global_attr_group);
7640         if (ret < 0) {
7641                 pr_err("sched_ext: Failed to add global attributes\n");
7642                 return ret;
7643         }
7644
7645         return 0;
7646 }
7647 __initcall(scx_init);