1 // SPDX-License-Identifier: GPL-2.0-only
3 * Copyright (C) 2020 Hisilicon Limited.
6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
8 #include <linux/debugfs.h>
9 #include <linux/delay.h>
10 #include <linux/device.h>
11 #include <linux/dma-mapping.h>
12 #include <linux/kernel.h>
13 #include <linux/kthread.h>
14 #include <linux/math64.h>
15 #include <linux/module.h>
16 #include <linux/pci.h>
17 #include <linux/platform_device.h>
18 #include <linux/slab.h>
19 #include <linux/timekeeping.h>
/*
 * ioctl command of the benchmark. The argument is a struct map_benchmark
 * that is both read from and written back to user space (_IOWR).
 */
21 #define DMA_MAP_BENCHMARK _IOWR('d', 1, struct map_benchmark)
/* Upper bounds that map_benchmark_ioctl() enforces on a request. */
22 #define DMA_MAP_MAX_THREADS 1024
23 #define DMA_MAP_MAX_SECONDS 300
/*
 * Direction codes accepted in map_benchmark.dma_dir; map_benchmark_ioctl()
 * translates them to the matching enum dma_data_direction value.
 */
25 #define DMA_MAP_BIDIRECTIONAL 0
26 #define DMA_MAP_TO_DEVICE 1
27 #define DMA_MAP_FROM_DEVICE 2
/*
 * Request/result record exchanged with user space through the
 * DMA_MAP_BENCHMARK ioctl: the caller fills in threads, seconds, node,
 * dma_bits and dma_dir, and do_map_benchmark() fills in the averages and
 * standard deviations before the record is copied back.
 *
 * NOTE(review): do_map_benchmark() also writes an unmap_stddev member; its
 * declaration and the closing brace of this struct are not visible in this
 * listing.
 */
29 struct map_benchmark
{
30 __u64 avg_map_100ns
; /* average map latency in 100ns */
31 __u64 map_stddev
; /* standard deviation of map latency */
32 __u64 avg_unmap_100ns
; /* average unmap latency in 100ns */
34 __u32 threads
; /* how many threads will do map/unmap in parallel */
35 __u32 seconds
; /* how long the test will last */
36 __s32 node
; /* which numa node this benchmark will run on */
37 __u32 dma_bits
; /* DMA addressing capability */
38 __u32 dma_dir
; /* DMA data direction */
39 __u64 expansion
[10]; /* For future use */
/*
 * Per-device benchmark state, allocated in __map_benchmark_probe() and
 * passed to the ioctl handler and to every worker thread.
 *
 * NOTE(review): the code also uses map->dev and map->loops; their
 * declarations and the closing brace of this struct are not visible in
 * this listing.
 */
42 struct map_benchmark_data
{
/* parameters from / results for user space */
43 struct map_benchmark bparam
;
/* debugfs file created at probe time, removed by the devm action */
45 struct dentry
*debugfs
;
/* kernel DMA direction derived from bparam.dma_dir */
46 enum dma_data_direction dir
;
/* accumulators shared by all worker threads, in 100ns units */
47 atomic64_t sum_map_100ns
;
48 atomic64_t sum_unmap_100ns
;
/* sums of squared latencies, used for the standard deviation */
49 atomic64_t sum_sq_map
;
50 atomic64_t sum_sq_unmap
;
/*
 * Benchmark worker, run as a kthread with a struct map_benchmark_data as
 * its argument.
 *
 * It obtains one page with __get_free_page() and, until
 * kthread_should_stop() becomes true, times one dma_map_single() and one
 * dma_unmap_single() of that page (PAGE_SIZE bytes, direction map->dir)
 * with ktime_get(). Each delta is divided by 100 to express it in 100ns
 * units; the value, its square and the iteration count are added to the
 * shared atomic64 accumulators in *map. The page is released with
 * free_page() at the end.
 *
 * NOTE(review): this listing is missing lines (local declarations, the
 * tail of the dma_mapping_error() branch, the return statements), so the
 * exit paths cannot be confirmed from here. If the mapping-error branch
 * lets the thread return on its own, check that do_map_benchmark() can
 * still safely call kthread_stop() on it.
 * NOTE(review): no rescheduling point is visible inside the loop; confirm
 * whether one is needed for long runs on non-preemptible kernels.
 */
54 static int map_benchmark_thread(void *data
)
58 struct map_benchmark_data
*map
= data
;
61 buf
= (void *)__get_free_page(GFP_KERNEL
);
65 while (!kthread_should_stop()) {
66 u64 map_100ns
, unmap_100ns
, map_sq
, unmap_sq
;
67 ktime_t map_stime
, map_etime
, unmap_stime
, unmap_etime
;
68 ktime_t map_delta
, unmap_delta
;
71 * for a non-coherent device, if we don't stain them in the
72 * cache, this will give an underestimate of the real-world
73 * overhead of BIDIRECTIONAL or TO_DEVICE mappings;
74 * 66 means evertything goes well! 66 is lucky.
/* dirty the page before mapping unless the device only writes to it */
76 if (map
->dir
!= DMA_FROM_DEVICE
)
77 memset(buf
, 0x66, PAGE_SIZE
);
/* timed section 1: map */
79 map_stime
= ktime_get();
80 dma_addr
= dma_map_single(map
->dev
, buf
, PAGE_SIZE
, map
->dir
);
81 if (unlikely(dma_mapping_error(map
->dev
, dma_addr
))) {
82 pr_err("dma_map_single failed on %s\n",
87 map_etime
= ktime_get();
88 map_delta
= ktime_sub(map_etime
, map_stime
);
/* timed section 2: unmap */
90 unmap_stime
= ktime_get();
91 dma_unmap_single(map
->dev
, dma_addr
, PAGE_SIZE
, map
->dir
);
92 unmap_etime
= ktime_get();
93 unmap_delta
= ktime_sub(unmap_etime
, unmap_stime
);
95 /* calculate sum and sum of squares */
97 map_100ns
= div64_ul(map_delta
, 100);
98 unmap_100ns
= div64_ul(unmap_delta
, 100);
99 map_sq
= map_100ns
* map_100ns
;
100 unmap_sq
= unmap_100ns
* unmap_100ns
;
/* publish this iteration to the accumulators shared with other workers */
102 atomic64_add(map_100ns
, &map
->sum_map_100ns
);
103 atomic64_add(unmap_100ns
, &map
->sum_unmap_100ns
);
104 atomic64_add(map_sq
, &map
->sum_sq_map
);
105 atomic64_add(unmap_sq
, &map
->sum_sq_unmap
);
106 atomic64_inc(&map
->loops
);
110 free_page((unsigned long)buf
);
/*
 * Run one benchmark with the parameters in map->bparam and store the
 * results back into map->bparam.
 *
 * Creates bparam.threads kthreads running map_benchmark_thread() on
 * bparam.node (bound to that node's cpumask unless the node is
 * NUMA_NO_NODE), zeroes the accumulators, wakes the threads, sleeps for
 * bparam.seconds via msleep_interruptible(), then stops every thread with
 * kthread_stop(). When at least one loop completed, each average is
 * sum / loops and each standard deviation is
 * int_sqrt64(sum_of_squares / loops - average * average), all in 100ns
 * units. A device reference is taken with get_device() before the threads
 * are created and dropped with put_device() afterwards.
 *
 * NOTE(review): cpumask_of_node(node) is evaluated in the initializer of
 * cpu_mask, i.e. before the node != NUMA_NO_NODE test further down.
 * Confirm that calling it with NUMA_NO_NODE is valid on all
 * configurations, or defer the lookup until after the check.
 * NOTE(review): msleep_interruptible() can presumably end early when a
 * signal is pending, which would shorten the measured interval; confirm
 * this is acceptable.
 * NOTE(review): this listing is missing lines (local declarations, the
 * cleanup after a failed kthread_create_on_node(), return statements), so
 * the error paths cannot be confirmed from here.
 */
114 static int do_map_benchmark(struct map_benchmark_data
*map
)
116 struct task_struct
**tsk
;
117 int threads
= map
->bparam
.threads
;
118 int node
= map
->bparam
.node
;
119 const cpumask_t
*cpu_mask
= cpumask_of_node(node
);
124 tsk
= kmalloc_array(threads
, sizeof(*tsk
), GFP_KERNEL
);
128 get_device(map
->dev
);
130 for (i
= 0; i
< threads
; i
++) {
131 tsk
[i
] = kthread_create_on_node(map_benchmark_thread
, map
,
132 map
->bparam
.node
, "dma-map-benchmark/%d", i
);
133 if (IS_ERR(tsk
[i
])) {
134 pr_err("create dma_map thread failed\n");
135 ret
= PTR_ERR(tsk
[i
]);
139 if (node
!= NUMA_NO_NODE
)
140 kthread_bind_mask(tsk
[i
], cpu_mask
);
143 /* clear the old value in the previous benchmark */
144 atomic64_set(&map
->sum_map_100ns
, 0);
145 atomic64_set(&map
->sum_unmap_100ns
, 0);
146 atomic64_set(&map
->sum_sq_map
, 0);
147 atomic64_set(&map
->sum_sq_unmap
, 0);
148 atomic64_set(&map
->loops
, 0);
/* the threads were only created above; start them all now */
150 for (i
= 0; i
< threads
; i
++)
151 wake_up_process(tsk
[i
]);
/* let the workers run for the requested number of seconds */
153 msleep_interruptible(map
->bparam
.seconds
* 1000);
155 /* wait for the completion of benchmark threads */
156 for (i
= 0; i
< threads
; i
++) {
157 ret
= kthread_stop(tsk
[i
]);
/* results are only computed when at least one iteration completed */
162 loops
= atomic64_read(&map
->loops
);
163 if (likely(loops
> 0)) {
164 u64 map_variance
, unmap_variance
;
165 u64 sum_map
= atomic64_read(&map
->sum_map_100ns
);
166 u64 sum_unmap
= atomic64_read(&map
->sum_unmap_100ns
);
167 u64 sum_sq_map
= atomic64_read(&map
->sum_sq_map
);
168 u64 sum_sq_unmap
= atomic64_read(&map
->sum_sq_unmap
);
170 /* average latency */
171 map
->bparam
.avg_map_100ns
= div64_u64(sum_map
, loops
);
172 map
->bparam
.avg_unmap_100ns
= div64_u64(sum_unmap
, loops
);
174 /* standard deviation of latency */
175 map_variance
= div64_u64(sum_sq_map
, loops
) -
176 map
->bparam
.avg_map_100ns
*
177 map
->bparam
.avg_map_100ns
;
178 unmap_variance
= div64_u64(sum_sq_unmap
, loops
) -
179 map
->bparam
.avg_unmap_100ns
*
180 map
->bparam
.avg_unmap_100ns
;
181 map
->bparam
.map_stddev
= int_sqrt64(map_variance
);
182 map
->bparam
.unmap_stddev
= int_sqrt64(unmap_variance
);
186 put_device(map
->dev
);
/*
 * ioctl handler of the debugfs file.
 *
 * Copies a struct map_benchmark from user space into map->bparam. For
 * DMA_MAP_BENCHMARK it then validates the request: threads must be in
 * 1..DMA_MAP_MAX_THREADS, seconds in 1..DMA_MAP_MAX_SECONDS, node must be
 * NUMA_NO_NODE or pass node_possible(), and dma_dir must be one of the
 * DMA_MAP_* direction codes. It saves the device's current DMA mask, sets
 * it to DMA_BIT_MASK(bparam.dma_bits), runs do_map_benchmark(), restores
 * the saved mask and copies map->bparam back to user space.
 *
 * NOTE(review): no range check on bparam.dma_bits is visible before it is
 * passed to DMA_BIT_MASK(); confirm that out-of-range values supplied by
 * user space are rejected.
 * NOTE(review): bparam.node is a signed user-supplied value and no bounds
 * check is visible before node_possible(); confirm that negative values
 * other than NUMA_NO_NODE and values beyond the node limit are rejected
 * first.
 * NOTE(review): map->bparam is shared per device and no locking is visible
 * here; confirm how concurrent ioctl callers are serialized.
 * NOTE(review): this listing is missing lines (the third parameter, local
 * declarations, break/return statements, the switch default label), so the
 * exact control flow cannot be confirmed from here.
 */
191 static long map_benchmark_ioctl(struct file
*file
, unsigned int cmd
,
194 struct map_benchmark_data
*map
= file
->private_data
;
195 void __user
*argp
= (void __user
*)arg
;
200 if (copy_from_user(&map
->bparam
, argp
, sizeof(map
->bparam
)))
204 case DMA_MAP_BENCHMARK
:
205 if (map
->bparam
.threads
== 0 ||
206 map
->bparam
.threads
> DMA_MAP_MAX_THREADS
) {
207 pr_err("invalid thread number\n");
211 if (map
->bparam
.seconds
== 0 ||
212 map
->bparam
.seconds
> DMA_MAP_MAX_SECONDS
) {
213 pr_err("invalid duration seconds\n");
217 if (map
->bparam
.node
!= NUMA_NO_NODE
&&
218 !node_possible(map
->bparam
.node
)) {
219 pr_err("invalid numa node\n");
/* translate the user-visible direction code to the kernel enum */
223 switch (map
->bparam
.dma_dir
) {
224 case DMA_MAP_BIDIRECTIONAL
:
225 map
->dir
= DMA_BIDIRECTIONAL
;
227 case DMA_MAP_FROM_DEVICE
:
228 map
->dir
= DMA_FROM_DEVICE
;
230 case DMA_MAP_TO_DEVICE
:
231 map
->dir
= DMA_TO_DEVICE
;
234 pr_err("invalid DMA direction\n");
/* remember the current mask so it can be restored after the run */
238 old_dma_mask
= dma_get_mask(map
->dev
);
240 ret
= dma_set_mask(map
->dev
,
241 DMA_BIT_MASK(map
->bparam
.dma_bits
));
243 pr_err("failed to set dma_mask on device %s\n",
248 ret
= do_map_benchmark(map
);
251 * restore the original dma_mask as many devices' dma_mask are
252 * set by architectures, acpi, busses. When we bind them back
253 * to their original drivers, those drivers shouldn't see
254 * dma_mask changed by benchmark
256 dma_set_mask(map
->dev
, old_dma_mask
);
/* hand the filled-in results back to the caller */
262 if (copy_to_user(argp
, &map
->bparam
, sizeof(map
->bparam
)))
/*
 * File operations of the "dma_map_benchmark" debugfs file; the ioctl entry
 * point is map_benchmark_ioctl().
 * NOTE(review): other members and the closing brace are not visible in
 * this listing.
 */
268 static const struct file_operations map_benchmark_fops
= {
270 .unlocked_ioctl
= map_benchmark_ioctl
,
/*
 * Device-managed cleanup action registered by __map_benchmark_probe() via
 * devm_add_action(): removes the debugfs entry stored in map->debugfs.
 * @data: the struct map_benchmark_data passed at registration time.
 */
273 static void map_benchmark_remove_debugfs(void *data
)
275 struct map_benchmark_data
*map
= (struct map_benchmark_data
*)data
;
277 debugfs_remove(map
->debugfs
);
/*
 * Common probe path shared by the platform and PCI drivers.
 *
 * Allocates a zeroed struct map_benchmark_data with devm_kzalloc(),
 * registers map_benchmark_remove_debugfs() as a device-managed action, and
 * creates the debugfs file "dma_map_benchmark" (mode 0600, NULL parent)
 * with the new state as its private data. The resulting dentry is stored
 * in map->debugfs; an error from debugfs_create_file() is returned through
 * PTR_ERR().
 *
 * NOTE(review): the cleanup action is registered before the debugfs file
 * exists, so on a failed creation it would run with the still-zeroed
 * map->debugfs; confirm debugfs_remove() accepts that.
 * NOTE(review): this listing is missing lines (local declarations, error
 * checks, where map->dev is set, return statements).
 */
280 static int __map_benchmark_probe(struct device
*dev
)
282 struct dentry
*entry
;
283 struct map_benchmark_data
*map
;
286 map
= devm_kzalloc(dev
, sizeof(*map
), GFP_KERNEL
);
291 ret
= devm_add_action(dev
, map_benchmark_remove_debugfs
, map
);
293 pr_err("Can't add debugfs remove action\n");
298 * we only permit a device bound with this driver, 2nd probe
301 entry
= debugfs_create_file("dma_map_benchmark", 0600, NULL
, map
,
302 &map_benchmark_fops
);
304 return PTR_ERR(entry
);
305 map
->debugfs
= entry
;
/* Platform-bus probe: forwards the embedded struct device to the common probe. */
310 static int map_benchmark_platform_probe(struct platform_device
*pdev
)
312 return __map_benchmark_probe(&pdev
->dev
);
/*
 * Platform driver named "dma_map_benchmark" whose probe callback is
 * map_benchmark_platform_probe().
 * NOTE(review): the nested initializer braces and any further members are
 * not visible in this listing.
 */
315 static struct platform_driver map_benchmark_platform_driver
= {
317 .name
= "dma_map_benchmark",
319 .probe
= map_benchmark_platform_probe
,
/*
 * PCI probe: forwards the embedded struct device to the common probe.
 * The matched id is not used by the visible code.
 */
323 map_benchmark_pci_probe(struct pci_dev
*pdev
, const struct pci_device_id
*id
)
325 return __map_benchmark_probe(&pdev
->dev
);
/*
 * PCI driver named "dma_map_benchmark" whose probe callback is
 * map_benchmark_pci_probe().
 * NOTE(review): no id_table member is visible in this listing; confirm how
 * devices get bound to this driver.
 */
328 static struct pci_driver map_benchmark_pci_driver
= {
329 .name
= "dma_map_benchmark",
330 .probe
= map_benchmark_pci_probe
,
/*
 * Module init: registers the PCI driver first, then the platform driver.
 * NOTE(review): the conditions around these calls are not visible in this
 * listing; the trailing pci_unregister_driver() presumably rolls back the
 * PCI registration when the platform registration fails - confirm.
 */
333 static int __init
map_benchmark_init(void)
337 ret
= pci_register_driver(&map_benchmark_pci_driver
);
341 ret
= platform_driver_register(&map_benchmark_platform_driver
);
343 pci_unregister_driver(&map_benchmark_pci_driver
);
/*
 * Module exit: unregisters both drivers in the reverse order of their
 * registration in map_benchmark_init().
 */
350 static void __exit
map_benchmark_cleanup(void)
352 platform_driver_unregister(&map_benchmark_platform_driver
);
353 pci_unregister_driver(&map_benchmark_pci_driver
);
356 module_init(map_benchmark_init
);
357 module_exit(map_benchmark_cleanup
);
359 MODULE_AUTHOR("Barry Song <song.bao.hua@hisilicon.com>");
360 MODULE_DESCRIPTION("dma_map benchmark driver");
361 MODULE_LICENSE("GPL");