// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright 2024 Rivos Inc.
 */

#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/jump_label.h>
#include <linux/kthread.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/types.h>
#include <asm/cpufeature.h>
#include <asm/hwprobe.h>
#include <asm/vector.h>

#include "copy-unaligned.h"

#define MISALIGNED_ACCESS_JIFFIES_LG2 1
#define MISALIGNED_BUFFER_SIZE 0x4000
#define MISALIGNED_BUFFER_ORDER get_order(MISALIGNED_BUFFER_SIZE)
#define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80)
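
/*
 * The test buffer is MISALIGNED_BUFFER_SIZE bytes; each timed copy moves
 * MISALIGNED_COPY_SIZE bytes, slightly less than half of it, so the
 * misaligned dst (first half) and src (second half) regions never overlap.
 */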

DEFINE_PER_CPU(long, misaligned_access_speed) = RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN;
DEFINE_PER_CPU(long, vector_misaligned_access) = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED;

#ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
static cpumask_t fast_misaligned_access;
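
/*
 * Time a word-wise unaligned copy against a byte-wise copy on this CPU,
 * keeping the best (lowest) cycle count seen for each over a fixed number
 * of jiffies, and record the result in misaligned_access_speed.
 */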
static int check_unaligned_access(void *param)
{
	int cpu = smp_processor_id();
	u64 start_cycles, end_cycles;
	u64 word_cycles;
	u64 byte_cycles;
	int ratio;
	unsigned long start_jiffies, now;
	struct page *page = param;
	void *dst;
	void *src;
	long speed = RISCV_HWPROBE_MISALIGNED_SCALAR_SLOW;

	if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN)
		return 0;

	/* Make an unaligned destination buffer. */
	dst = (void *)((unsigned long)page_address(page) | 0x1);
	/* Unalign src as well, but differently (off by 1 + 2 = 3). */
	src = dst + (MISALIGNED_BUFFER_SIZE / 2);
	src += 2;
	word_cycles = -1ULL;

	/* Do a warmup. */
	__riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
	preempt_disable();
	start_jiffies = jiffies;
	while ((now = jiffies) == start_jiffies)
		cpu_relax();

	/*
	 * For a fixed amount of time, repeatedly try the function, and take
	 * the best time in cycles as the measurement.
	 */
	while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
		start_cycles = get_cycles64();
		/* Ensure the CSR read can't reorder WRT to the copy. */
		mb();
		__riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
		/* Ensure the copy ends before the end time is snapped. */
		mb();
		end_cycles = get_cycles64();
		if ((end_cycles - start_cycles) < word_cycles)
			word_cycles = end_cycles - start_cycles;
	}

	byte_cycles = -1ULL;
	__riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
	start_jiffies = jiffies;
	while ((now = jiffies) == start_jiffies)
		cpu_relax();

	while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
		start_cycles = get_cycles64();
		mb();
		__riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
		mb();
		end_cycles = get_cycles64();
		if ((end_cycles - start_cycles) < byte_cycles)
			byte_cycles = end_cycles - start_cycles;
	}

	preempt_enable();

	/* Don't divide by zero. */
	if (!word_cycles || !byte_cycles) {
		pr_warn("cpu%d: rdtime lacks granularity needed to measure unaligned access speed\n",
			cpu);

		return 0;
	}

	if (word_cycles < byte_cycles)
		speed = RISCV_HWPROBE_MISALIGNED_SCALAR_FAST;

	ratio = div_u64((byte_cycles * 100), word_cycles);
	pr_info("cpu%d: Ratio of byte access time to unaligned word access is %d.%02d, unaligned accesses are %s\n",
		cpu,
		ratio / 100,
		ratio % 100,
		(speed == RISCV_HWPROBE_MISALIGNED_SCALAR_FAST) ? "fast" : "slow");

	per_cpu(misaligned_access_speed, cpu) = speed;

	/*
	 * Set the value of fast_misaligned_access of a CPU. These operations
	 * are atomic to avoid race conditions.
	 */
	if (speed == RISCV_HWPROBE_MISALIGNED_SCALAR_FAST)
		cpumask_set_cpu(cpu, &fast_misaligned_access);
	else
		cpumask_clear_cpu(cpu, &fast_misaligned_access);

	return 0;
}
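
/*
 * Per-CPU helper run via on_each_cpu(): every CPU except the boot CPU
 * measures itself using its own preallocated buffer.
 */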
static void check_unaligned_access_nonboot_cpu(void *param)
{
	unsigned int cpu = smp_processor_id();
	struct page **pages = param;

	if (smp_processor_id() != 0)
		check_unaligned_access(pages[cpu]);
}

DEFINE_STATIC_KEY_FALSE(fast_unaligned_access_speed_key);
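
/*
 * Enable the fast-unaligned static key only if the number of CPUs set in
 * @mask matches @weight, i.e. every CPU under consideration measured as
 * fast; otherwise disable it.
 */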
static void modify_unaligned_access_branches(cpumask_t *mask, int weight)
{
	if (cpumask_weight(mask) == weight)
		static_branch_enable_cpuslocked(&fast_unaligned_access_speed_key);
	else
		static_branch_disable_cpuslocked(&fast_unaligned_access_speed_key);
}

static void set_unaligned_access_static_branches_except_cpu(int cpu)
{
	/*
	 * Same as set_unaligned_access_static_branches, except excludes the
	 * given CPU from the result. When a CPU is hotplugged into an offline
	 * state, this function is called before the CPU is set to offline in
	 * the cpumask, and thus the CPU needs to be explicitly excluded.
	 */

	cpumask_t fast_except_me;

	cpumask_and(&fast_except_me, &fast_misaligned_access, cpu_online_mask);
	cpumask_clear_cpu(cpu, &fast_except_me);

	modify_unaligned_access_branches(&fast_except_me, num_online_cpus() - 1);
}

static void set_unaligned_access_static_branches(void)
{
	/*
	 * This will be called after check_unaligned_access_all_cpus so the
	 * result of unaligned access speed for all CPUs will be available.
	 *
	 * To avoid the number of online cpus changing between reading
	 * cpu_online_mask and calling num_online_cpus, cpus_read_lock must be
	 * held before calling this function.
	 */

	cpumask_t fast_and_online;

	cpumask_and(&fast_and_online, &fast_misaligned_access, cpu_online_mask);

	modify_unaligned_access_branches(&fast_and_online, num_online_cpus());
}

static int lock_and_set_unaligned_access_static_branch(void)
{
	cpus_read_lock();
	set_unaligned_access_static_branches();
	cpus_read_unlock();

	return 0;
}

arch_initcall_sync(lock_and_set_unaligned_access_static_branch);
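
/*
 * CPU hotplug "online" callback: measure a newly onlined CPU (unless it
 * already has a result) and re-evaluate the fast-unaligned static key.
 */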
static int riscv_online_cpu(unsigned int cpu)
{
	static struct page *buf;

	/* We are already set since the last check */
	if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN)
		goto exit;

	check_unaligned_access_emulated(NULL);
	buf = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
	if (!buf) {
		pr_warn("Allocation failure, not measuring misaligned performance\n");
		return -ENOMEM;
	}

	check_unaligned_access(buf);
	__free_pages(buf, MISALIGNED_BUFFER_ORDER);

exit:
	set_unaligned_access_static_branches();

	return 0;
}
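
/*
 * CPU hotplug "offline" callback: re-evaluate the static key with the
 * departing CPU excluded from the fast mask.
 */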
static int riscv_offline_cpu(unsigned int cpu)
{
	set_unaligned_access_static_branches_except_cpu(cpu);

	return 0;
}

/* Measure unaligned access speed on all CPUs present at boot in parallel. */
static int check_unaligned_access_speed_all_cpus(void)
{
	unsigned int cpu;
	unsigned int cpu_count = num_possible_cpus();
	struct page **bufs = kcalloc(cpu_count, sizeof(*bufs), GFP_KERNEL);

	if (!bufs) {
		pr_warn("Allocation failure, not measuring misaligned performance\n");
		return 0;
	}

	/*
	 * Allocate separate buffers for each CPU so there's no fighting over
	 * cache lines.
	 */
	for_each_cpu(cpu, cpu_online_mask) {
		bufs[cpu] = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
		if (!bufs[cpu]) {
			pr_warn("Allocation failure, not measuring misaligned performance\n");
			goto out;
		}
	}

	/* Check everybody except 0, who stays behind to tend jiffies. */
	on_each_cpu(check_unaligned_access_nonboot_cpu, bufs, 1);

	/* Check core 0. */
	smp_call_on_cpu(0, check_unaligned_access, bufs[0], true);

	/*
	 * Setup hotplug callbacks for any new CPUs that come online or go
	 * offline.
	 */
	cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online",
				  riscv_online_cpu, riscv_offline_cpu);

out:
	for_each_cpu(cpu, cpu_online_mask) {
		if (bufs[cpu])
			__free_pages(bufs[cpu], MISALIGNED_BUFFER_ORDER);
	}

	kfree(bufs);
	return 0;
}

#else /* CONFIG_RISCV_PROBE_UNALIGNED_ACCESS */
static int check_unaligned_access_speed_all_cpus(void)
{
	return 0;
}
#endif

#ifdef CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
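
/*
 * Vector variant of the benchmark above: time vector word-wise vs. byte-wise
 * unaligned copies and record the result in vector_misaligned_access.
 */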
static void check_vector_unaligned_access(struct work_struct *work __always_unused)
{
	int cpu = smp_processor_id();
	u64 start_cycles, end_cycles;
	u64 word_cycles;
	u64 byte_cycles;
	int ratio;
	unsigned long start_jiffies, now;
	struct page *page;
	void *dst;
	void *src;
	long speed = RISCV_HWPROBE_MISALIGNED_VECTOR_SLOW;

	if (per_cpu(vector_misaligned_access, cpu) != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN)
		return;

	page = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
	if (!page) {
		pr_warn("Allocation failure, not measuring vector misaligned performance\n");
		return;
	}

	/* Make an unaligned destination buffer. */
	dst = (void *)((unsigned long)page_address(page) | 0x1);
	/* Unalign src as well, but differently (off by 1 + 2 = 3). */
	src = dst + (MISALIGNED_BUFFER_SIZE / 2);
	src += 2;
	word_cycles = -1ULL;

	/* Do a warmup. */
	kernel_vector_begin();
	__riscv_copy_vec_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);

	start_jiffies = jiffies;
	while ((now = jiffies) == start_jiffies)
		cpu_relax();

	/*
	 * For a fixed amount of time, repeatedly try the function, and take
	 * the best time in cycles as the measurement.
	 */
	while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
		start_cycles = get_cycles64();
		/* Ensure the CSR read can't reorder WRT to the copy. */
		mb();
		__riscv_copy_vec_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
		/* Ensure the copy ends before the end time is snapped. */
		mb();
		end_cycles = get_cycles64();
		if ((end_cycles - start_cycles) < word_cycles)
			word_cycles = end_cycles - start_cycles;
	}

	byte_cycles = -1ULL;
	__riscv_copy_vec_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
	start_jiffies = jiffies;
	while ((now = jiffies) == start_jiffies)
		cpu_relax();

	while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
		start_cycles = get_cycles64();
		/* Ensure the CSR read can't reorder WRT to the copy. */
		mb();
		__riscv_copy_vec_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
		/* Ensure the copy ends before the end time is snapped. */
		mb();
		end_cycles = get_cycles64();
		if ((end_cycles - start_cycles) < byte_cycles)
			byte_cycles = end_cycles - start_cycles;
	}

	kernel_vector_end();

	/* Don't divide by zero. */
	if (!word_cycles || !byte_cycles) {
		pr_warn("cpu%d: rdtime lacks granularity needed to measure unaligned vector access speed\n",
			cpu);

		return;
	}

	if (word_cycles < byte_cycles)
		speed = RISCV_HWPROBE_MISALIGNED_VECTOR_FAST;

	ratio = div_u64((byte_cycles * 100), word_cycles);
	pr_info("cpu%d: Ratio of vector byte access time to vector unaligned word access is %d.%02d, unaligned accesses are %s\n",
		cpu,
		ratio / 100,
		ratio % 100,
		(speed == RISCV_HWPROBE_MISALIGNED_VECTOR_FAST) ? "fast" : "slow");

	per_cpu(vector_misaligned_access, cpu) = speed;
}
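
/*
 * CPU hotplug "online" callback for the vector probe: re-check a newly
 * onlined vector-capable CPU whose per-CPU result is still at its
 * UNSUPPORTED default.
 */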
static int riscv_online_cpu_vec(unsigned int cpu)
{
	if (!has_vector())
		return 0;

	if (per_cpu(vector_misaligned_access, cpu) != RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED)
		return 0;

	check_vector_unaligned_access_emulated(NULL);
	check_vector_unaligned_access(NULL);
	return 0;
}

/* Measure unaligned access speed on all CPUs present at boot in parallel. */
static int vec_check_unaligned_access_speed_all_cpus(void *unused __always_unused)
{
	schedule_on_each_cpu(check_vector_unaligned_access);

	/*
	 * Setup hotplug callbacks for any new CPUs that come online or go
	 * offline.
	 */
	cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online",
				  riscv_online_cpu_vec, NULL);

	return 0;
}

#else /* CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS */
static int vec_check_unaligned_access_speed_all_cpus(void *unused __always_unused)
{
	return 0;
}
#endif
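
/*
 * Boot-time entry point: first determine which CPUs emulate (trap on)
 * misaligned accesses, then, where vector misaligned accesses are supported,
 * kick off the vector probe on a kthread, and measure scalar speed unless
 * scalar accesses are emulated everywhere.
 */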
static int check_unaligned_access_all_cpus(void)
{
	bool all_cpus_emulated, all_cpus_vec_unsupported;

	all_cpus_emulated = check_unaligned_access_emulated_all_cpus();
	all_cpus_vec_unsupported = check_vector_unaligned_access_emulated_all_cpus();

	if (!all_cpus_vec_unsupported &&
	    IS_ENABLED(CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS)) {
		kthread_run(vec_check_unaligned_access_speed_all_cpus,
			    NULL, "vec_check_unaligned_access_speed_all_cpus");
	}

	if (!all_cpus_emulated)
		return check_unaligned_access_speed_all_cpus();

	return 0;
}

arch_initcall(check_unaligned_access_all_cpus);