// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2020 HiSilicon Limited.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/debugfs.h>
#include <linux/delay.h>
#include <linux/device.h>
#include <linux/dma-mapping.h>
#include <linux/kernel.h>
#include <linux/kthread.h>
#include <linux/map_benchmark.h>
#include <linux/math64.h>
#include <linux/module.h>
#include <linux/pci.h>
#include <linux/platform_device.h>
#include <linux/slab.h>
#include <linux/timekeeping.h>
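
/*
 * Per-device benchmark state: the user-supplied parameters (bparam), the
 * debugfs entry used to drive the test, the DMA direction to benchmark, and
 * the accumulators shared by the worker threads (latency sums and sums of
 * squares, in units of 100ns, plus the total iteration count).
 */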
struct map_benchmark_data {
	struct map_benchmark bparam;
	struct device *dev;
	struct dentry *debugfs;
	enum dma_data_direction dir;
	atomic64_t sum_map_100ns;
	atomic64_t sum_unmap_100ns;
	atomic64_t sum_sq_map;
	atomic64_t sum_sq_unmap;
	atomic64_t loops;
};
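
/*
 * Worker kthread: repeatedly map and unmap a granule-sized buffer with
 * dma_map_single()/dma_unmap_single(), timing each call and adding the
 * results to the shared accumulators until it is asked to stop.
 */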
static int map_benchmark_thread(void *data)
{
	void *buf;
	dma_addr_t dma_addr;
	struct map_benchmark_data *map = data;
	int npages = map->bparam.granule;
	u64 size = npages * PAGE_SIZE;
	int ret = 0;

	buf = alloc_pages_exact(size, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	while (!kthread_should_stop()) {
		u64 map_100ns, unmap_100ns, map_sq, unmap_sq;
		ktime_t map_stime, map_etime, unmap_stime, unmap_etime;
		ktime_t map_delta, unmap_delta;

		/*
		 * For a non-coherent device, if we don't dirty the buffer in
		 * the cache, this will give an underestimate of the real-world
		 * overhead of BIDIRECTIONAL or TO_DEVICE mappings;
		 * 0x66 means everything goes well! 66 is lucky.
		 */
		if (map->dir != DMA_FROM_DEVICE)
			memset(buf, 0x66, size);

		map_stime = ktime_get();
		dma_addr = dma_map_single(map->dev, buf, size, map->dir);
		if (unlikely(dma_mapping_error(map->dev, dma_addr))) {
			pr_err("dma_map_single failed on %s\n",
				dev_name(map->dev));
			ret = -ENOMEM;
			goto out;
		}
		map_etime = ktime_get();
		map_delta = ktime_sub(map_etime, map_stime);

		/* Pretend DMA is transmitting */
		ndelay(map->bparam.dma_trans_ns);

		unmap_stime = ktime_get();
		dma_unmap_single(map->dev, dma_addr, size, map->dir);
		unmap_etime = ktime_get();
		unmap_delta = ktime_sub(unmap_etime, unmap_stime);
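
		/*
		 * ktime deltas are in nanoseconds; the div64_ul() calls below
		 * convert them to the 100ns units used by the bparam fields.
		 */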
		/* calculate sum and sum of squares */

		map_100ns = div64_ul(map_delta, 100);
		unmap_100ns = div64_ul(unmap_delta, 100);
		map_sq = map_100ns * map_100ns;
		unmap_sq = unmap_100ns * unmap_100ns;

		atomic64_add(map_100ns, &map->sum_map_100ns);
		atomic64_add(unmap_100ns, &map->sum_unmap_100ns);
		atomic64_add(map_sq, &map->sum_sq_map);
		atomic64_add(unmap_sq, &map->sum_sq_unmap);
		atomic64_inc(&map->loops);

		/*
		 * We may test for a long time, so periodically check whether
		 * we need to schedule to avoid starving the others. Otherwise
		 * we may hang up the kernel in a non-preemptible kernel when
		 * the number of test kthreads is >= the number of CPUs: the
		 * test kthreads would run endlessly on every CPU since the
		 * thread responsible for telling them to stop (in
		 * do_map_benchmark()) could never be scheduled.
		 *
		 * Note this may degrade the test concurrency since the test
		 * threads may need to share CPU time with other load in the
		 * system. So it's recommended to run this benchmark on an
		 * idle system.
		 */
		cond_resched();
	}

out:
	free_pages_exact(buf, size);
	return ret;
}
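
/*
 * Spawn the requested number of worker kthreads, let them run for
 * bparam.seconds, then stop them and convert the accumulated sums into the
 * average latency and standard deviation reported back through bparam.
 */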
static int do_map_benchmark(struct map_benchmark_data *map)
{
	struct task_struct **tsk;
	int threads = map->bparam.threads;
	int node = map->bparam.node;
	u64 loops;
	int ret = 0;
	int i;

	tsk = kmalloc_array(threads, sizeof(*tsk), GFP_KERNEL);
	if (!tsk)
		return -ENOMEM;

	get_device(map->dev);

	for (i = 0; i < threads; i++) {
		tsk[i] = kthread_create_on_node(map_benchmark_thread, map,
				map->bparam.node, "dma-map-benchmark/%d", i);
		if (IS_ERR(tsk[i])) {
			pr_err("create dma_map thread failed\n");
			ret = PTR_ERR(tsk[i]);
			while (--i >= 0)
				kthread_stop(tsk[i]);
			goto out;
		}

		if (node != NUMA_NO_NODE)
			kthread_bind_mask(tsk[i], cpumask_of_node(node));
	}

	/* clear the old values from any previous benchmark run */
	atomic64_set(&map->sum_map_100ns, 0);
	atomic64_set(&map->sum_unmap_100ns, 0);
	atomic64_set(&map->sum_sq_map, 0);
	atomic64_set(&map->sum_sq_unmap, 0);
	atomic64_set(&map->loops, 0);
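
	/*
	 * Take a reference on each worker before waking it so that it can be
	 * joined with kthread_stop_put() once the timed run is over.
	 */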
	for (i = 0; i < threads; i++) {
		get_task_struct(tsk[i]);
		wake_up_process(tsk[i]);
	}

	msleep_interruptible(map->bparam.seconds * 1000);

	/* wait for the completion of all started benchmark threads */
	for (i = 0; i < threads; i++) {
		int kthread_ret = kthread_stop_put(tsk[i]);

		if (kthread_ret)
			ret = kthread_ret;
	}

	if (ret)
		goto out;

	loops = atomic64_read(&map->loops);
	if (likely(loops > 0)) {
		u64 map_variance, unmap_variance;
		u64 sum_map = atomic64_read(&map->sum_map_100ns);
		u64 sum_unmap = atomic64_read(&map->sum_unmap_100ns);
		u64 sum_sq_map = atomic64_read(&map->sum_sq_map);
		u64 sum_sq_unmap = atomic64_read(&map->sum_sq_unmap);

		/* average latency */
		map->bparam.avg_map_100ns = div64_u64(sum_map, loops);
		map->bparam.avg_unmap_100ns = div64_u64(sum_unmap, loops);
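
		/*
		 * The variance is E[X^2] - (E[X])^2, taken over the
		 * per-iteration samples accumulated above.
		 */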
		/* standard deviation of latency */
		map_variance = div64_u64(sum_sq_map, loops) -
				map->bparam.avg_map_100ns *
				map->bparam.avg_map_100ns;
		unmap_variance = div64_u64(sum_sq_unmap, loops) -
				map->bparam.avg_unmap_100ns *
				map->bparam.avg_unmap_100ns;
		map->bparam.map_stddev = int_sqrt64(map_variance);
		map->bparam.unmap_stddev = int_sqrt64(unmap_variance);
	}

out:
	put_device(map->dev);
	kfree(tsk);
	return ret;
}

static long map_benchmark_ioctl(struct file *file, unsigned int cmd,
		unsigned long arg)
{
	struct map_benchmark_data *map = file->private_data;
	void __user *argp = (void __user *)arg;
	u64 old_dma_mask;
	int ret;

	if (copy_from_user(&map->bparam, argp, sizeof(map->bparam)))
		return -EFAULT;

	switch (cmd) {
	case DMA_MAP_BENCHMARK:
		if (map->bparam.threads == 0 ||
		    map->bparam.threads > DMA_MAP_MAX_THREADS) {
			pr_err("invalid thread number\n");
			return -EINVAL;
		}

		if (map->bparam.seconds == 0 ||
		    map->bparam.seconds > DMA_MAP_MAX_SECONDS) {
			pr_err("invalid duration seconds\n");
			return -EINVAL;
		}

		if (map->bparam.dma_trans_ns > DMA_MAP_MAX_TRANS_DELAY) {
			pr_err("invalid transmission delay\n");
			return -EINVAL;
		}

		if (map->bparam.node != NUMA_NO_NODE &&
		    (map->bparam.node < 0 || map->bparam.node >= MAX_NUMNODES ||
		     !node_possible(map->bparam.node))) {
			pr_err("invalid numa node\n");
			return -EINVAL;
		}

		if (map->bparam.granule < 1 || map->bparam.granule > 1024) {
			pr_err("invalid granule size\n");
			return -EINVAL;
		}

		switch (map->bparam.dma_dir) {
		case DMA_MAP_BIDIRECTIONAL:
			map->dir = DMA_BIDIRECTIONAL;
			break;
		case DMA_MAP_FROM_DEVICE:
			map->dir = DMA_FROM_DEVICE;
			break;
		case DMA_MAP_TO_DEVICE:
			map->dir = DMA_TO_DEVICE;
			break;
		default:
			pr_err("invalid DMA direction\n");
			return -EINVAL;
		}

		old_dma_mask = dma_get_mask(map->dev);

		ret = dma_set_mask(map->dev,
				   DMA_BIT_MASK(map->bparam.dma_bits));
		if (ret) {
			pr_err("failed to set dma_mask on device %s\n",
				dev_name(map->dev));
			return -EINVAL;
		}

		ret = do_map_benchmark(map);

		/*
		 * Restore the original dma_mask: many devices' dma_mask is
		 * set by architectures, ACPI or buses. When we bind them back
		 * to their original drivers, those drivers shouldn't see a
		 * dma_mask changed by the benchmark.
		 */
		dma_set_mask(map->dev, old_dma_mask);

		if (ret)
			return ret;
		break;
	default:
		return -EINVAL;
	}

	if (copy_to_user(argp, &map->bparam, sizeof(map->bparam)))
		return -EFAULT;

	return 0;
}

static const struct file_operations map_benchmark_fops = {
	.open			= simple_open,
	.unlocked_ioctl		= map_benchmark_ioctl,
};
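
/*
 * A minimal userspace sketch of how this interface can be driven (not part
 * of this module, and not necessarily identical to the in-tree selftest).
 * The struct and macro names below come from <linux/map_benchmark.h>, which
 * is not an exported UAPI header, so a userspace build would need its own
 * copy of those definitions:
 *
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <sys/ioctl.h>
 *
 *	int main(void)
 *	{
 *		struct map_benchmark bp = {
 *			.threads = 1,
 *			.seconds = 20,
 *			.node = -1,			// NUMA_NO_NODE
 *			.dma_bits = 32,
 *			.dma_dir = DMA_MAP_BIDIRECTIONAL,
 *			.granule = 1,			// pages per mapping
 *		};
 *		int fd = open("/sys/kernel/debug/dma_map_benchmark", O_RDWR);
 *
 *		if (fd < 0 || ioctl(fd, DMA_MAP_BENCHMARK, &bp))
 *			return 1;
 *		printf("map: avg %llu stddev %llu (x100ns)\n",
 *		       (unsigned long long)bp.avg_map_100ns,
 *		       (unsigned long long)bp.map_stddev);
 *		return 0;
 *	}
 *
 * Before running it, the device under test has to be unbound from its own
 * driver and bound to dma_map_benchmark (e.g. via driver_override in sysfs)
 * so that __map_benchmark_probe() creates the debugfs file.
 */
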
static void map_benchmark_remove_debugfs(void *data)
{
	struct map_benchmark_data *map = (struct map_benchmark_data *)data;

	debugfs_remove(map->debugfs);
}

static int __map_benchmark_probe(struct device *dev)
{
	struct dentry *entry;
	struct map_benchmark_data *map;
	int ret = 0;

	map = devm_kzalloc(dev, sizeof(*map), GFP_KERNEL);
	if (!map)
		return -ENOMEM;
	map->dev = dev;

	ret = devm_add_action(dev, map_benchmark_remove_debugfs, map);
	if (ret) {
		pr_err("Can't add debugfs remove action\n");
		return ret;
	}

	/*
	 * We only permit a single device bound with this driver; a second
	 * probe will fail.
	 */
	entry = debugfs_create_file("dma_map_benchmark", 0600, NULL, map,
			&map_benchmark_fops);
	if (IS_ERR(entry))
		return PTR_ERR(entry);
	map->debugfs = entry;

	return ret;
}

static int map_benchmark_platform_probe(struct platform_device *pdev)
{
	return __map_benchmark_probe(&pdev->dev);
}

static struct platform_driver map_benchmark_platform_driver = {
	.driver		= {
		.name	= "dma_map_benchmark",
	},
	.probe = map_benchmark_platform_probe,
};

static int
map_benchmark_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	return __map_benchmark_probe(&pdev->dev);
}

static struct pci_driver map_benchmark_pci_driver = {
	.name	= "dma_map_benchmark",
	.probe	= map_benchmark_pci_probe,
};

static int __init map_benchmark_init(void)
{
	int ret;

	ret = pci_register_driver(&map_benchmark_pci_driver);
	if (ret)
		return ret;

	ret = platform_driver_register(&map_benchmark_platform_driver);
	if (ret) {
		pci_unregister_driver(&map_benchmark_pci_driver);
		return ret;
	}

	return 0;
}

static void __exit map_benchmark_cleanup(void)
{
	platform_driver_unregister(&map_benchmark_platform_driver);
	pci_unregister_driver(&map_benchmark_pci_driver);
}

module_init(map_benchmark_init);
module_exit(map_benchmark_cleanup);

MODULE_AUTHOR("Barry Song <song.bao.hua@hisilicon.com>");
MODULE_DESCRIPTION("dma_map benchmark driver");