// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
/*
 * Copyright(c) 2015 - 2020 Intel Corporation.
 *
 * This file is provided under a dual BSD/GPLv2 license.  When using or
 * redistributing this file, you may do so under either license.
 */
#include <linux/topology.h>
#include <linux/cpumask.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/numa.h>

#include "hfi.h"
#include "affinity.h"
#include "sdma.h"
#include "trace.h"
struct hfi1_affinity_node_list node_affinity = {
	.list = LIST_HEAD_INIT(node_affinity.list),
	.lock = __MUTEX_INITIALIZER(node_affinity.lock)
};
/* Name of IRQ types, indexed by enum irq_type */
static const char * const irq_type_names[] = {
	"SDMA",
	"RCVCTXT",
	"NETDEVCTXT",
	"GENERAL",
	"OTHER",
};
/* Per NUMA node count of HFI devices */
static unsigned int *hfi1_per_node_cntr;
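
/* Reset a cpu_mask_set so CPU allocation starts from a clean state */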
static inline void init_cpu_mask_set(struct cpu_mask_set *set)
{
	cpumask_clear(&set->mask);
	cpumask_clear(&set->used);
	set->gen = 0;
}
/* Increment generation of CPU set if needed */
static void _cpu_mask_set_gen_inc(struct cpu_mask_set *set)
{
	if (cpumask_equal(&set->mask, &set->used)) {
		/*
		 * We've used up all the CPUs, bump up the generation
		 * and reset the 'used' map
		 */
		set->gen++;
		cpumask_clear(&set->used);
	}
}
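
/*
 * Decrement generation of CPU set if needed: once every CPU handed out in
 * the current generation has been returned, fall back to the previous
 * generation and mark its CPUs as used again.
 */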
static void _cpu_mask_set_gen_dec(struct cpu_mask_set *set)
{
	if (cpumask_empty(&set->used) && set->gen) {
		set->gen--;
		cpumask_copy(&set->used, &set->mask);
	}
}
/* Get the first CPU from the list of unused CPUs in a CPU set data structure */
static int cpu_mask_set_get_first(struct cpu_mask_set *set, cpumask_var_t diff)
{
	int cpu;

	_cpu_mask_set_gen_inc(set);

	/* Find out CPUs left in CPU mask */
	cpumask_andnot(diff, &set->mask, &set->used);

	cpu = cpumask_first(diff);
	if (cpu >= nr_cpu_ids) /* empty */
		cpu = -EINVAL;
	else
		cpumask_set_cpu(cpu, &set->used);

	return cpu;
}
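
/* Return a CPU to the set, rolling the generation back if it drains */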
static void cpu_mask_set_put(struct cpu_mask_set *set, int cpu)
{
	cpumask_clear_cpu(cpu, &set->used);
	_cpu_mask_set_gen_dec(set);
}
/* Initialize non-HT cpu cores mask */
void init_real_cpu_mask(void)
{
	int possible, curr_cpu, i, ht;

	cpumask_clear(&node_affinity.real_cpu_mask);

	/* Start with cpu online mask as the real cpu mask */
	cpumask_copy(&node_affinity.real_cpu_mask, cpu_online_mask);

	/*
	 * Remove HT cores from the real cpu mask.  Do this in two steps below.
	 */
	possible = cpumask_weight(&node_affinity.real_cpu_mask);
	ht = cpumask_weight(topology_sibling_cpumask(
				cpumask_first(&node_affinity.real_cpu_mask)));
	/*
	 * Step 1.  Skip over the first N HT siblings and use them as the
	 * "real" cores.  Assumes that HT cores are not enumerated in
	 * succession (except in the single core case).
	 */
	curr_cpu = cpumask_first(&node_affinity.real_cpu_mask);
	for (i = 0; i < possible / ht; i++)
		curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask);
	/*
	 * Step 2.  Remove the remaining HT siblings.  Use cpumask_next() to
	 * skip any gaps.
	 */
	for (; i < possible; i++) {
		cpumask_clear_cpu(curr_cpu, &node_affinity.real_cpu_mask);
		curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask);
	}
}
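
/*
 * Set up the global node_affinity state: seed the process CPU set from the
 * online CPUs, record topology counts, and count HFI devices per NUMA node
 * by walking the PCI device table.
 */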
int node_affinity_init(void)
{
	int node;
	struct pci_dev *dev = NULL;
	const struct pci_device_id *ids = hfi1_pci_tbl;

	cpumask_clear(&node_affinity.proc.used);
	cpumask_copy(&node_affinity.proc.mask, cpu_online_mask);

	node_affinity.proc.gen = 0;
	node_affinity.num_core_siblings =
				cpumask_weight(topology_sibling_cpumask(
					cpumask_first(&node_affinity.proc.mask)
					));
	node_affinity.num_possible_nodes = num_possible_nodes();
	node_affinity.num_online_nodes = num_online_nodes();
	node_affinity.num_online_cpus = num_online_cpus();

	/*
	 * The real cpu mask is part of the affinity struct but it has to be
	 * initialized early. It is needed to calculate the number of user
	 * contexts in set_up_context_variables().
	 */
	init_real_cpu_mask();

	hfi1_per_node_cntr = kcalloc(node_affinity.num_possible_nodes,
				     sizeof(*hfi1_per_node_cntr), GFP_KERNEL);
	if (!hfi1_per_node_cntr)
		return -ENOMEM;

	while (ids->vendor) {
		dev = NULL;
		while ((dev = pci_get_device(ids->vendor, ids->device, dev))) {
			node = pcibus_to_node(dev->bus);
			if (node < 0)
				goto out;

			hfi1_per_node_cntr[node]++;
		}
		ids++;
	}

	return 0;

out:
	/*
	 * Invalid PCI NUMA node information found, note it, and populate
	 * our database 1:1.
	 */
	pr_err("HFI: Invalid PCI NUMA node. Performance may be affected\n");
	pr_err("HFI: System BIOS may need to be upgraded\n");
	for (node = 0; node < node_affinity.num_possible_nodes; node++)
		hfi1_per_node_cntr[node] = 1;

	return 0;
}
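
/* Release a node affinity entry and its per-CPU completion vector counters */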
static void node_affinity_destroy(struct hfi1_affinity_node *entry)
{
	free_percpu(entry->comp_vect_affinity);
	kfree(entry);
}
void node_affinity_destroy_all(void)
{
	struct list_head *pos, *q;
	struct hfi1_affinity_node *entry;

	mutex_lock(&node_affinity.lock);
	list_for_each_safe(pos, q, &node_affinity.list) {
		entry = list_entry(pos, struct hfi1_affinity_node,
				   list);
		list_del(pos);
		node_affinity_destroy(entry);
	}
	mutex_unlock(&node_affinity.lock);
	kfree(hfi1_per_node_cntr);
}
static struct hfi1_affinity_node *node_affinity_allocate(int node)
{
	struct hfi1_affinity_node *entry;

	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
	if (!entry)
		return NULL;
	entry->node = node;
	entry->comp_vect_affinity = alloc_percpu(u16);
	INIT_LIST_HEAD(&entry->list);

	return entry;
}
/*
 * It appends an entry to the list.
 * It *must* be called with node_affinity.lock held.
 */
static void node_affinity_add_tail(struct hfi1_affinity_node *entry)
{
	list_add_tail(&entry->list, &node_affinity.list);
}
/* It must be called with node_affinity.lock held */
static struct hfi1_affinity_node *node_affinity_lookup(int node)
{
	struct list_head *pos;
	struct hfi1_affinity_node *entry;

	list_for_each(pos, &node_affinity.list) {
		entry = list_entry(pos, struct hfi1_affinity_node, list);
		if (entry->node == node)
			return entry;
	}

	return NULL;
}
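
/*
 * Pick the CPU in 'possible_cpumask' with the lowest usage counter,
 * increment its counter, and return it.
 */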
static int per_cpu_affinity_get(cpumask_var_t possible_cpumask,
				u16 __percpu *comp_vect_affinity)
{
	int curr_cpu;
	u16 cntr;
	u16 prev_cntr;
	int ret_cpu;

	if (!possible_cpumask) {
		ret_cpu = -EINVAL;
		goto fail;
	}

	if (!comp_vect_affinity) {
		ret_cpu = -EINVAL;
		goto fail;
	}

	ret_cpu = cpumask_first(possible_cpumask);
	if (ret_cpu >= nr_cpu_ids) {
		ret_cpu = -EINVAL;
		goto fail;
	}

	prev_cntr = *per_cpu_ptr(comp_vect_affinity, ret_cpu);
	for_each_cpu(curr_cpu, possible_cpumask) {
		cntr = *per_cpu_ptr(comp_vect_affinity, curr_cpu);

		if (cntr < prev_cntr) {
			ret_cpu = curr_cpu;
			prev_cntr = cntr;
		}
	}

	*per_cpu_ptr(comp_vect_affinity, ret_cpu) += 1;

fail:
	return ret_cpu;
}
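
/*
 * Find the CPU in 'possible_cpumask' with the highest usage counter,
 * decrement its counter, and return it.
 */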
static int per_cpu_affinity_put_max(cpumask_var_t possible_cpumask,
				    u16 __percpu *comp_vect_affinity)
{
	int curr_cpu;
	int max_cpu;
	u16 cntr;
	u16 prev_cntr;

	if (!possible_cpumask)
		return -EINVAL;

	if (!comp_vect_affinity)
		return -EINVAL;

	max_cpu = cpumask_first(possible_cpumask);
	if (max_cpu >= nr_cpu_ids)
		return -EINVAL;

	prev_cntr = *per_cpu_ptr(comp_vect_affinity, max_cpu);
	for_each_cpu(curr_cpu, possible_cpumask) {
		cntr = *per_cpu_ptr(comp_vect_affinity, curr_cpu);

		if (cntr > prev_cntr) {
			max_cpu = curr_cpu;
			prev_cntr = cntr;
		}
	}

	*per_cpu_ptr(comp_vect_affinity, max_cpu) -= 1;

	return max_cpu;
}
/*
 * Non-interrupt CPUs are used first, then interrupt CPUs.
 * Two already allocated cpu masks must be passed.
 */
static int _dev_comp_vect_cpu_get(struct hfi1_devdata *dd,
				  struct hfi1_affinity_node *entry,
				  cpumask_var_t non_intr_cpus,
				  cpumask_var_t available_cpus)
	__must_hold(&node_affinity.lock)
{
	int cpu;
	struct cpu_mask_set *set = dd->comp_vect;

	lockdep_assert_held(&node_affinity.lock);
	if (!non_intr_cpus) {
		cpu = -1;
		goto fail;
	}

	if (!available_cpus) {
		cpu = -1;
		goto fail;
	}

	/* Available CPUs for pinning completion vectors */
	_cpu_mask_set_gen_inc(set);
	cpumask_andnot(available_cpus, &set->mask, &set->used);

	/* Available CPUs without SDMA engine interrupts */
	cpumask_andnot(non_intr_cpus, available_cpus,
		       &entry->def_intr.used);

	/* If there are non-interrupt CPUs available, use them first */
	if (!cpumask_empty(non_intr_cpus))
		cpu = cpumask_first(non_intr_cpus);
	else /* Otherwise, use interrupt CPUs */
		cpu = cpumask_first(available_cpus);

	if (cpu >= nr_cpu_ids) { /* empty */
		cpu = -1;
		goto fail;
	}
	cpumask_set_cpu(cpu, &set->used);

fail:
	return cpu;
}
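
/* Release a completion vector CPU back to the device's comp_vect set */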
static void _dev_comp_vect_cpu_put(struct hfi1_devdata *dd, int cpu)
{
	struct cpu_mask_set *set = dd->comp_vect;

	if (cpu < 0)
		return;

	cpu_mask_set_put(set, cpu);
}
/* _dev_comp_vect_mappings_destroy() is reentrant */
static void _dev_comp_vect_mappings_destroy(struct hfi1_devdata *dd)
{
	int i, cpu;

	if (!dd->comp_vect_mappings)
		return;

	for (i = 0; i < dd->comp_vect_possible_cpus; i++) {
		cpu = dd->comp_vect_mappings[i];
		_dev_comp_vect_cpu_put(dd, cpu);
		dd->comp_vect_mappings[i] = -1;
		hfi1_cdbg(AFFINITY,
			  "[%s] Release CPU %d from completion vector %d",
			  rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), cpu, i);
	}

	kfree(dd->comp_vect_mappings);
	dd->comp_vect_mappings = NULL;
}
/*
 * This function creates the table for looking up CPUs for completion vectors.
 * num_comp_vectors needs to have been initialized before calling this function.
 */
static int _dev_comp_vect_mappings_create(struct hfi1_devdata *dd,
					  struct hfi1_affinity_node *entry)
	__must_hold(&node_affinity.lock)
{
	int i, cpu, ret;
	cpumask_var_t non_intr_cpus;
	cpumask_var_t available_cpus;

	lockdep_assert_held(&node_affinity.lock);

	if (!zalloc_cpumask_var(&non_intr_cpus, GFP_KERNEL))
		return -ENOMEM;

	if (!zalloc_cpumask_var(&available_cpus, GFP_KERNEL)) {
		free_cpumask_var(non_intr_cpus);
		return -ENOMEM;
	}

	dd->comp_vect_mappings = kcalloc(dd->comp_vect_possible_cpus,
					 sizeof(*dd->comp_vect_mappings),
					 GFP_KERNEL);
	if (!dd->comp_vect_mappings) {
		ret = -ENOMEM;
		goto fail;
	}
	for (i = 0; i < dd->comp_vect_possible_cpus; i++)
		dd->comp_vect_mappings[i] = -1;

	for (i = 0; i < dd->comp_vect_possible_cpus; i++) {
		cpu = _dev_comp_vect_cpu_get(dd, entry, non_intr_cpus,
					     available_cpus);
		if (cpu < 0) {
			ret = -EINVAL;
			goto fail;
		}

		dd->comp_vect_mappings[i] = cpu;
		hfi1_cdbg(AFFINITY,
			  "[%s] Completion Vector %d -> CPU %d",
			  rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), i, cpu);
	}

	free_cpumask_var(available_cpus);
	free_cpumask_var(non_intr_cpus);
	return 0;

fail:
	free_cpumask_var(available_cpus);
	free_cpumask_var(non_intr_cpus);
	_dev_comp_vect_mappings_destroy(dd);

	return ret;
}
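
/*
 * Build a device's completion vector to CPU mapping table under
 * node_affinity.lock, using its NUMA node's affinity entry.
 */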
int hfi1_comp_vectors_set_up(struct hfi1_devdata *dd)
{
	int ret;
	struct hfi1_affinity_node *entry;

	mutex_lock(&node_affinity.lock);
	entry = node_affinity_lookup(dd->node);
	if (!entry) {
		ret = -EINVAL;
		goto unlock;
	}
	ret = _dev_comp_vect_mappings_create(dd, entry);
unlock:
	mutex_unlock(&node_affinity.lock);

	return ret;
}
void hfi1_comp_vectors_clean_up(struct hfi1_devdata *dd)
{
	_dev_comp_vect_mappings_destroy(dd);
}
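
/* Translate a completion vector index into the CPU assigned to it */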
int hfi1_comp_vect_mappings_lookup(struct rvt_dev_info *rdi, int comp_vect)
{
	struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi);
	struct hfi1_devdata *dd = dd_from_dev(verbs_dev);

	if (!dd->comp_vect_mappings)
		return -EINVAL;
	if (comp_vect >= dd->comp_vect_possible_cpus)
		return -EINVAL;

	return dd->comp_vect_mappings[comp_vect];
}
/*
 * It assumes dd->comp_vect_possible_cpus is available.
 */
static int _dev_comp_vect_cpu_mask_init(struct hfi1_devdata *dd,
					struct hfi1_affinity_node *entry,
					bool first_dev_init)
	__must_hold(&node_affinity.lock)
{
	int i, j, curr_cpu;
	int possible_cpus_comp_vect = 0;
	struct cpumask *dev_comp_vect_mask = &dd->comp_vect->mask;

	lockdep_assert_held(&node_affinity.lock);
	/*
	 * If there's only one CPU available for completion vectors, then
	 * there will only be one completion vector available. Otherwise,
	 * the number of completion vectors available will be the number of
	 * available CPUs divided by the number of devices in the same
	 * NUMA node.
	 */
	if (cpumask_weight(&entry->comp_vect_mask) == 1) {
		possible_cpus_comp_vect = 1;
		dd_dev_warn(dd,
			    "Number of kernel receive queues is too large for completion vector affinity to be effective\n");
	} else {
		possible_cpus_comp_vect +=
			cpumask_weight(&entry->comp_vect_mask) /
				       hfi1_per_node_cntr[dd->node];

		/*
		 * If the completion vector CPUs available doesn't divide
		 * evenly among devices, then the first device to be
		 * initialized gets an extra CPU.
		 */
		if (first_dev_init &&
		    cpumask_weight(&entry->comp_vect_mask) %
		    hfi1_per_node_cntr[dd->node] != 0)
			possible_cpus_comp_vect++;
	}

	dd->comp_vect_possible_cpus = possible_cpus_comp_vect;

	/* Reserving CPUs for device completion vector */
	for (i = 0; i < dd->comp_vect_possible_cpus; i++) {
		curr_cpu = per_cpu_affinity_get(&entry->comp_vect_mask,
						entry->comp_vect_affinity);
		if (curr_cpu < 0)
			goto fail;

		cpumask_set_cpu(curr_cpu, dev_comp_vect_mask);
	}

	hfi1_cdbg(AFFINITY,
		  "[%s] Completion vector affinity CPU set(s) %*pbl",
		  rvt_get_ibdev_name(&(dd)->verbs_dev.rdi),
		  cpumask_pr_args(dev_comp_vect_mask));

	return 0;

fail:
	for (j = 0; j < i; j++)
		per_cpu_affinity_put_max(&entry->comp_vect_mask,
					 entry->comp_vect_affinity);

	return curr_cpu;
}
/*
 * It assumes dd->comp_vect_possible_cpus is available.
 */
static void _dev_comp_vect_cpu_mask_clean_up(struct hfi1_devdata *dd,
					     struct hfi1_affinity_node *entry)
	__must_hold(&node_affinity.lock)
{
	int i, cpu;

	lockdep_assert_held(&node_affinity.lock);
	if (!dd->comp_vect_possible_cpus)
		return;

	for (i = 0; i < dd->comp_vect_possible_cpus; i++) {
		cpu = per_cpu_affinity_put_max(&dd->comp_vect->mask,
					       entry->comp_vect_affinity);
		/* Clearing CPU in device completion vector cpu mask */
		if (cpu >= 0)
			cpumask_clear_cpu(cpu, &dd->comp_vect->mask);
	}

	dd->comp_vect_possible_cpus = 0;
}
/*
 * Interrupt affinity.
 *
 * non-rcv avail gets a default mask that
 * starts as possible cpus with threads reset
 * and each rcv avail reset.
 *
 * rcv avail gets node relative 1 wrapping back
 * to the node relative 1 as necessary.
 */
int hfi1_dev_affinity_init(struct hfi1_devdata *dd)
{
	int node = pcibus_to_node(dd->pcidev->bus);
	struct hfi1_affinity_node *entry;
	const struct cpumask *local_mask;
	int curr_cpu, possible, i, ret;
	bool new_entry = false;

	/*
	 * If the BIOS does not have the NUMA node information set, select
	 * NUMA 0 so we get consistent performance.
	 */
	if (node < 0) {
		dd_dev_err(dd, "Invalid PCI NUMA node. Performance may be affected\n");
		node = 0;
	}
	dd->node = node;

	local_mask = cpumask_of_node(dd->node);
	if (cpumask_first(local_mask) >= nr_cpu_ids)
		local_mask = topology_core_cpumask(0);

	mutex_lock(&node_affinity.lock);
	entry = node_affinity_lookup(dd->node);

	/*
	 * If this is the first time this NUMA node's affinity is used,
	 * create an entry in the global affinity structure and initialize it.
	 */
	if (!entry) {
		entry = node_affinity_allocate(node);
		if (!entry) {
			dd_dev_err(dd,
				   "Unable to allocate global affinity node\n");
			ret = -ENOMEM;
			goto fail;
		}
		new_entry = true;

		init_cpu_mask_set(&entry->def_intr);
		init_cpu_mask_set(&entry->rcv_intr);
		cpumask_clear(&entry->comp_vect_mask);
		cpumask_clear(&entry->general_intr_mask);
		/* Use the "real" cpu mask of this node as the default */
		cpumask_and(&entry->def_intr.mask, &node_affinity.real_cpu_mask,
			    local_mask);

		/* fill in the receive list */
		possible = cpumask_weight(&entry->def_intr.mask);
		curr_cpu = cpumask_first(&entry->def_intr.mask);

		if (possible == 1) {
			/* only one CPU, everyone will use it */
			cpumask_set_cpu(curr_cpu, &entry->rcv_intr.mask);
			cpumask_set_cpu(curr_cpu, &entry->general_intr_mask);
		} else {
			/*
			 * The general/control context will be the first CPU in
			 * the default list, so it is removed from the default
			 * list and added to the general interrupt list.
			 */
			cpumask_clear_cpu(curr_cpu, &entry->def_intr.mask);
			cpumask_set_cpu(curr_cpu, &entry->general_intr_mask);
			curr_cpu = cpumask_next(curr_cpu,
						&entry->def_intr.mask);

			/*
			 * Remove the remaining kernel receive queues from
			 * the default list and add them to the receive list.
			 */
			for (i = 0;
			     i < (dd->n_krcv_queues - 1) *
				  hfi1_per_node_cntr[dd->node];
			     i++) {
				cpumask_clear_cpu(curr_cpu,
						  &entry->def_intr.mask);
				cpumask_set_cpu(curr_cpu,
						&entry->rcv_intr.mask);
				curr_cpu = cpumask_next(curr_cpu,
							&entry->def_intr.mask);
				if (curr_cpu >= nr_cpu_ids)
					break;
			}

			/*
			 * If there ends up being 0 CPU cores leftover for SDMA
			 * engines, use the same CPU cores as general/control
			 * context.
			 */
			if (cpumask_weight(&entry->def_intr.mask) == 0)
				cpumask_copy(&entry->def_intr.mask,
					     &entry->general_intr_mask);
		}

		/* Determine completion vector CPUs for the entire node */
		cpumask_and(&entry->comp_vect_mask,
			    &node_affinity.real_cpu_mask, local_mask);
		cpumask_andnot(&entry->comp_vect_mask,
			       &entry->comp_vect_mask,
			       &entry->rcv_intr.mask);
		cpumask_andnot(&entry->comp_vect_mask,
			       &entry->comp_vect_mask,
			       &entry->general_intr_mask);

		/*
		 * If there ends up being 0 CPU cores leftover for completion
		 * vectors, use the same CPU core as the general/control
		 * context.
		 */
		if (cpumask_weight(&entry->comp_vect_mask) == 0)
			cpumask_copy(&entry->comp_vect_mask,
				     &entry->general_intr_mask);
	}

	ret = _dev_comp_vect_cpu_mask_init(dd, entry, new_entry);
	if (ret < 0)
		goto fail;

	if (new_entry)
		node_affinity_add_tail(entry);

	mutex_unlock(&node_affinity.lock);

	return 0;

fail:
	if (new_entry)
		node_affinity_destroy(entry);
	mutex_unlock(&node_affinity.lock);
	return ret;
}
void hfi1_dev_affinity_clean_up(struct hfi1_devdata *dd)
{
	struct hfi1_affinity_node *entry;

	if (dd->node < 0)
		return;

	mutex_lock(&node_affinity.lock);
	entry = node_affinity_lookup(dd->node);
	if (!entry)
		goto unlock;

	/*
	 * Free device completion vector CPUs to be used by future
	 * completion vectors
	 */
	_dev_comp_vect_cpu_mask_clean_up(dd, entry);
unlock:
	mutex_unlock(&node_affinity.lock);
	dd->node = NUMA_NO_NODE;
}
/*
 * Function updates the irq affinity hint for msix after it has been changed
 * by the user using the /proc/irq interface. This function only accepts
 * one cpu in the mask.
 */
static void hfi1_update_sdma_affinity(struct hfi1_msix_entry *msix, int cpu)
{
	struct sdma_engine *sde = msix->arg;
	struct hfi1_devdata *dd = sde->dd;
	struct hfi1_affinity_node *entry;
	struct cpu_mask_set *set;
	int i, old_cpu;

	if (cpu > num_online_cpus() || cpu == sde->cpu)
		return;

	mutex_lock(&node_affinity.lock);
	entry = node_affinity_lookup(dd->node);
	if (!entry)
		goto unlock;

	old_cpu = sde->cpu;
	sde->cpu = cpu;
	cpumask_clear(&msix->mask);
	cpumask_set_cpu(cpu, &msix->mask);
	dd_dev_dbg(dd, "IRQ: %u, type %s engine %u -> cpu: %d\n",
		   msix->irq, irq_type_names[msix->type],
		   sde->this_idx, cpu);
	irq_set_affinity_hint(msix->irq, &msix->mask);

	/*
	 * Set the new cpu in the hfi1_affinity_node and clean
	 * the old cpu if it is not used by any other IRQ
	 */
	set = &entry->def_intr;
	cpumask_set_cpu(cpu, &set->mask);
	cpumask_set_cpu(cpu, &set->used);
	for (i = 0; i < dd->msix_info.max_requested; i++) {
		struct hfi1_msix_entry *other_msix;

		other_msix = &dd->msix_info.msix_entries[i];
		if (other_msix->type != IRQ_SDMA || other_msix == msix)
			continue;

		if (cpumask_test_cpu(old_cpu, &other_msix->mask))
			goto unlock;
	}
	cpumask_clear_cpu(old_cpu, &set->mask);
	cpumask_clear_cpu(old_cpu, &set->used);
unlock:
	mutex_unlock(&node_affinity.lock);
}
static void hfi1_irq_notifier_notify(struct irq_affinity_notify *notify,
				     const cpumask_t *mask)
{
	int cpu = cpumask_first(mask);
	struct hfi1_msix_entry *msix = container_of(notify,
						    struct hfi1_msix_entry,
						    notify);

	/* Only one CPU configuration supported currently */
	hfi1_update_sdma_affinity(msix, cpu);
}
static void hfi1_irq_notifier_release(struct kref *ref)
{
	/*
	 * This is required by affinity notifier. We don't have anything to
	 * free here.
	 */
}
static void hfi1_setup_sdma_notifier(struct hfi1_msix_entry *msix)
{
	struct irq_affinity_notify *notify = &msix->notify;

	notify->irq = msix->irq;
	notify->notify = hfi1_irq_notifier_notify;
	notify->release = hfi1_irq_notifier_release;

	if (irq_set_affinity_notifier(notify->irq, notify))
		pr_err("Failed to register sdma irq affinity notifier for irq %d\n",
		       notify->irq);
}
static void hfi1_cleanup_sdma_notifier(struct hfi1_msix_entry *msix)
{
	struct irq_affinity_notify *notify = &msix->notify;

	if (irq_set_affinity_notifier(notify->irq, NULL))
		pr_err("Failed to cleanup sdma irq affinity notifier for irq %d\n",
		       notify->irq);
}
/*
 * Function sets the irq affinity for msix.
 * It *must* be called with node_affinity.lock held.
 */
static int get_irq_affinity(struct hfi1_devdata *dd,
			    struct hfi1_msix_entry *msix)
{
	cpumask_var_t diff;
	struct hfi1_affinity_node *entry;
	struct cpu_mask_set *set = NULL;
	struct sdma_engine *sde = NULL;
	struct hfi1_ctxtdata *rcd = NULL;
	char extra[64];
	int cpu = -1;

	extra[0] = '\0';
	cpumask_clear(&msix->mask);

	entry = node_affinity_lookup(dd->node);

	switch (msix->type) {
	case IRQ_SDMA:
		sde = (struct sdma_engine *)msix->arg;
		scnprintf(extra, 64, "engine %u", sde->this_idx);
		set = &entry->def_intr;
		break;
	case IRQ_GENERAL:
		cpu = cpumask_first(&entry->general_intr_mask);
		break;
	case IRQ_RCVCTXT:
		rcd = (struct hfi1_ctxtdata *)msix->arg;
		if (rcd->ctxt == HFI1_CTRL_CTXT)
			cpu = cpumask_first(&entry->general_intr_mask);
		else
			set = &entry->rcv_intr;
		scnprintf(extra, 64, "ctxt %u", rcd->ctxt);
		break;
	case IRQ_NETDEVCTXT:
		rcd = (struct hfi1_ctxtdata *)msix->arg;
		set = &entry->def_intr;
		scnprintf(extra, 64, "ctxt %u", rcd->ctxt);
		break;
	default:
		dd_dev_err(dd, "Invalid IRQ type %d\n", msix->type);
		return -EINVAL;
	}

	/*
	 * The general and control contexts are placed on a particular
	 * CPU, which is set above. Skip accounting for it. Everything else
	 * finds its CPU here.
	 */
	if (cpu == -1 && set) {
		if (!zalloc_cpumask_var(&diff, GFP_KERNEL))
			return -ENOMEM;

		cpu = cpu_mask_set_get_first(set, diff);
		if (cpu < 0) {
			free_cpumask_var(diff);
			dd_dev_err(dd, "Failure to obtain CPU for IRQ\n");
			return cpu;
		}

		free_cpumask_var(diff);
	}

	cpumask_set_cpu(cpu, &msix->mask);
	dd_dev_info(dd, "IRQ: %u, type %s %s -> cpu: %d\n",
		    msix->irq, irq_type_names[msix->type],
		    extra, cpu);
	irq_set_affinity_hint(msix->irq, &msix->mask);

	if (msix->type == IRQ_SDMA) {
		sde->cpu = cpu;
		hfi1_setup_sdma_notifier(msix);
	}

	return 0;
}
int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix)
{
	int ret;

	mutex_lock(&node_affinity.lock);
	ret = get_irq_affinity(dd, msix);
	mutex_unlock(&node_affinity.lock);

	return ret;
}
void hfi1_put_irq_affinity(struct hfi1_devdata *dd,
			   struct hfi1_msix_entry *msix)
{
	struct cpu_mask_set *set = NULL;
	struct hfi1_ctxtdata *rcd;
	struct hfi1_affinity_node *entry;

	mutex_lock(&node_affinity.lock);
	entry = node_affinity_lookup(dd->node);

	switch (msix->type) {
	case IRQ_SDMA:
		set = &entry->def_intr;
		hfi1_cleanup_sdma_notifier(msix);
		break;
	case IRQ_GENERAL:
		/* Don't do accounting for general contexts */
		break;
	case IRQ_RCVCTXT:
		rcd = (struct hfi1_ctxtdata *)msix->arg;
		/* Don't do accounting for control contexts */
		if (rcd->ctxt != HFI1_CTRL_CTXT)
			set = &entry->rcv_intr;
		break;
	case IRQ_NETDEVCTXT:
		rcd = (struct hfi1_ctxtdata *)msix->arg;
		set = &entry->def_intr;
		break;
	default:
		mutex_unlock(&node_affinity.lock);
		return;
	}

	if (set) {
		cpumask_andnot(&set->used, &set->used, &msix->mask);
		_cpu_mask_set_gen_dec(set);
	}

	irq_set_affinity_hint(msix->irq, NULL);
	cpumask_clear(&msix->mask);
	mutex_unlock(&node_affinity.lock);
}
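
/*
 * Build the mask of CPUs that corresponds to HW thread number 'hw_thread_no':
 * trim the process mask to one thread per physical core, then shift to the
 * requested sibling set.
 */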
/* This should be called with node_affinity.lock held */
static void find_hw_thread_mask(uint hw_thread_no, cpumask_var_t hw_thread_mask,
				struct hfi1_affinity_node_list *affinity)
{
	int possible, curr_cpu, i;
	uint num_cores_per_socket = node_affinity.num_online_cpus /
					affinity->num_core_siblings /
						node_affinity.num_online_nodes;

	cpumask_copy(hw_thread_mask, &affinity->proc.mask);
	if (affinity->num_core_siblings > 0) {
		/* Removing other siblings not needed for now */
		possible = cpumask_weight(hw_thread_mask);
		curr_cpu = cpumask_first(hw_thread_mask);
		for (i = 0;
		     i < num_cores_per_socket * node_affinity.num_online_nodes;
		     i++)
			curr_cpu = cpumask_next(curr_cpu, hw_thread_mask);

		for (; i < possible; i++) {
			cpumask_clear_cpu(curr_cpu, hw_thread_mask);
			curr_cpu = cpumask_next(curr_cpu, hw_thread_mask);
		}

		/* Identifying correct HW threads within physical cores */
		cpumask_shift_left(hw_thread_mask, hw_thread_mask,
				   num_cores_per_socket *
				   node_affinity.num_online_nodes *
				   hw_thread_no);
	}
}
int hfi1_get_proc_affinity(int node)
{
	int cpu = -1, ret, i;
	struct hfi1_affinity_node *entry;
	cpumask_var_t diff, hw_thread_mask, available_mask, intrs_mask;
	const struct cpumask *node_mask,
		*proc_mask = current->cpus_ptr;
	struct hfi1_affinity_node_list *affinity = &node_affinity;
	struct cpu_mask_set *set = &affinity->proc;

	/*
	 * check whether process/context affinity has already
	 * been set
	 */
	if (current->nr_cpus_allowed == 1) {
		hfi1_cdbg(PROC, "PID %u %s affinity set to CPU %*pbl",
			  current->pid, current->comm,
			  cpumask_pr_args(proc_mask));
		/*
		 * Mark the pre-set CPU as used. This is atomic so we don't
		 * need the lock
		 */
		cpu = cpumask_first(proc_mask);
		cpumask_set_cpu(cpu, &set->used);
		goto done;
	} else if (current->nr_cpus_allowed < cpumask_weight(&set->mask)) {
		hfi1_cdbg(PROC, "PID %u %s affinity set to CPU set(s) %*pbl",
			  current->pid, current->comm,
			  cpumask_pr_args(proc_mask));
		goto done;
	}

	/*
	 * The process does not have a preset CPU affinity so find one to
	 * recommend using the following algorithm:
	 *
	 * For each user process that is opening a context on HFI Y:
	 *  a) If all cores are filled, reinitialize the bitmask
	 *  b) Fill real cores first, then HT cores (First set of HT
	 *     cores on all physical cores, then second set of HT core,
	 *     and, so on) in the following order:
	 *
	 *     1. Same NUMA node as HFI Y and not running an IRQ
	 *        handler
	 *     2. Same NUMA node as HFI Y and running an IRQ handler
	 *     3. Different NUMA node to HFI Y and not running an IRQ
	 *        handler
	 *     4. Different NUMA node to HFI Y and running an IRQ
	 *        handler
	 *  c) Mark core as filled in the bitmask. As user processes are
	 *     done, clear cores from the bitmask.
	 */

	ret = zalloc_cpumask_var(&diff, GFP_KERNEL);
	if (!ret)
		goto done;
	ret = zalloc_cpumask_var(&hw_thread_mask, GFP_KERNEL);
	if (!ret)
		goto free_diff;
	ret = zalloc_cpumask_var(&available_mask, GFP_KERNEL);
	if (!ret)
		goto free_hw_thread_mask;
	ret = zalloc_cpumask_var(&intrs_mask, GFP_KERNEL);
	if (!ret)
		goto free_available_mask;

	mutex_lock(&affinity->lock);
	/*
	 * If we've used all available HW threads, clear the mask and start
	 * overloading.
	 */
	_cpu_mask_set_gen_inc(set);

	/*
	 * If NUMA node has CPUs used by interrupt handlers, include them in the
	 * interrupt handler mask.
	 */
	entry = node_affinity_lookup(node);
	if (entry) {
		cpumask_copy(intrs_mask, (entry->def_intr.gen ?
					  &entry->def_intr.mask :
					  &entry->def_intr.used));
		cpumask_or(intrs_mask, intrs_mask, (entry->rcv_intr.gen ?
						    &entry->rcv_intr.mask :
						    &entry->rcv_intr.used));
		cpumask_or(intrs_mask, intrs_mask, &entry->general_intr_mask);
	}
	hfi1_cdbg(PROC, "CPUs used by interrupts: %*pbl",
		  cpumask_pr_args(intrs_mask));

	cpumask_copy(hw_thread_mask, &set->mask);

	/*
	 * If HT cores are enabled, identify which HW threads within the
	 * physical cores should be used.
	 */
	if (affinity->num_core_siblings > 0) {
		for (i = 0; i < affinity->num_core_siblings; i++) {
			find_hw_thread_mask(i, hw_thread_mask, affinity);

			/*
			 * If there's at least one available core for this HW
			 * thread number, stop looking for a core.
			 *
			 * diff will always be not empty at least once in this
			 * loop as the used mask gets reset when
			 * (set->mask == set->used) before this loop.
			 */
			cpumask_andnot(diff, hw_thread_mask, &set->used);
			if (!cpumask_empty(diff))
				break;
		}
	}
	hfi1_cdbg(PROC, "Same available HW thread on all physical CPUs: %*pbl",
		  cpumask_pr_args(hw_thread_mask));

	node_mask = cpumask_of_node(node);
	hfi1_cdbg(PROC, "Device on NUMA %u, CPUs %*pbl", node,
		  cpumask_pr_args(node_mask));

	/* Get cpumask of available CPUs on preferred NUMA */
	cpumask_and(available_mask, hw_thread_mask, node_mask);
	cpumask_andnot(available_mask, available_mask, &set->used);
	hfi1_cdbg(PROC, "Available CPUs on NUMA %u: %*pbl", node,
		  cpumask_pr_args(available_mask));

	/*
	 * At first, we don't want to place processes on the same
	 * CPUs as interrupt handlers. Then, CPUs running interrupt
	 * handlers are used.
	 *
	 * 1) If diff is not empty, then there are CPUs not running
	 *    interrupt handlers available, so diff gets copied
	 *    over to available_mask.
	 * 2) If diff is empty, then all CPUs not running interrupt
	 *    handlers are taken, so available_mask contains all
	 *    available CPUs running interrupt handlers.
	 * 3) If available_mask is empty, then all CPUs on the
	 *    preferred NUMA node are taken, so other NUMA nodes are
	 *    used for process assignments using the same method as
	 *    the preferred NUMA node.
	 */
	cpumask_andnot(diff, available_mask, intrs_mask);
	if (!cpumask_empty(diff))
		cpumask_copy(available_mask, diff);

	/* If we don't have CPUs on the preferred node, use other NUMA nodes */
	if (cpumask_empty(available_mask)) {
		cpumask_andnot(available_mask, hw_thread_mask, &set->used);
		/* Excluding preferred NUMA cores */
		cpumask_andnot(available_mask, available_mask, node_mask);
		hfi1_cdbg(PROC,
			  "Preferred NUMA node cores are taken, cores available in other NUMA nodes: %*pbl",
			  cpumask_pr_args(available_mask));

		/*
		 * At first, we don't want to place processes on the same
		 * CPUs as interrupt handlers.
		 */
		cpumask_andnot(diff, available_mask, intrs_mask);
		if (!cpumask_empty(diff))
			cpumask_copy(available_mask, diff);
	}
	hfi1_cdbg(PROC, "Possible CPUs for process: %*pbl",
		  cpumask_pr_args(available_mask));

	cpu = cpumask_first(available_mask);
	if (cpu >= nr_cpu_ids) /* empty */
		cpu = -1;
	else
		cpumask_set_cpu(cpu, &set->used);

	mutex_unlock(&affinity->lock);
	hfi1_cdbg(PROC, "Process assigned to CPU %d", cpu);

	free_cpumask_var(intrs_mask);
free_available_mask:
	free_cpumask_var(available_mask);
free_hw_thread_mask:
	free_cpumask_var(hw_thread_mask);
free_diff:
	free_cpumask_var(diff);
done:
	return cpu;
}
void hfi1_put_proc_affinity(int cpu)
{
	struct hfi1_affinity_node_list *affinity = &node_affinity;
	struct cpu_mask_set *set = &affinity->proc;

	if (cpu < 0)
		return;

	mutex_lock(&affinity->lock);
	cpu_mask_set_put(set, cpu);
	hfi1_cdbg(PROC, "Returning CPU %d for future process assignment", cpu);
	mutex_unlock(&affinity->lock);
}