/*
 * This file is provided under a dual BSD/GPLv2 license.  When using or
 *   redistributing this file, you may do so under either license.
 *
 *   GPL LICENSE SUMMARY
 *
 *   Copyright(c) 2015 Intel Corporation. All rights reserved.
 *
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of version 2 of the GNU General Public License as
 *   published by the Free Software Foundation.
 *
 *   BSD LICENSE
 *
 *   Copyright(c) 2015 Intel Corporation. All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 *   PCIe NTB Perf Linux driver
 */
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/time.h>
#include <linux/timer.h>
#include <linux/dma-mapping.h>
#include <linux/pci.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/debugfs.h>
#include <linux/dmaengine.h>
#include <linux/delay.h>
#include <linux/sizes.h>
#include <linux/ntb.h>
#define DRIVER_NAME		"ntb_perf"
#define DRIVER_DESCRIPTION	"PCIe NTB Performance Measurement Tool"

#define DRIVER_LICENSE		"Dual BSD/GPL"
#define DRIVER_VERSION		"1.0"
#define DRIVER_AUTHOR		"Dave Jiang <dave.jiang@intel.com>"

#define PERF_LINK_DOWN_TIMEOUT	10
#define PERF_VERSION		0xffff0001
#define MAX_THREADS		32
#define MAX_TEST_SIZE		SZ_1M
#define DMA_OUT_RESOURCE_TO	50
#define DMA_RETRIES		20
#define SZ_4G			(1ULL << 32)
#define MAX_SEG_ORDER		20 /* no larger than 1M for kmalloc buffer */
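/*
 * Number of per-thread source buffers. The definition is missing from this
 * excerpt, so the value here is an assumption; it must be a power of two
 * because ntb_perf_thread() wraps src_idx with "& (MAX_SRCS - 1)".
 */
#define MAX_SRCS		32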
MODULE_LICENSE(DRIVER_LICENSE);
MODULE_VERSION(DRIVER_VERSION);
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESCRIPTION);
static struct dentry *perf_debugfs_dir;
static unsigned int seg_order = 19; /* 512K */
module_param(seg_order, uint, 0644);
MODULE_PARM_DESC(seg_order, "size order [2^n] of buffer segment for testing");

static unsigned int run_order = 32; /* 4G */
module_param(run_order, uint, 0644);
MODULE_PARM_DESC(run_order, "size order [2^n] of total data to transfer");

static bool use_dma; /* default to false */
module_param(use_dma, bool, 0644);
MODULE_PARM_DESC(use_dma, "Use a DMA engine to measure performance");
struct perf_mw {
	phys_addr_t	phys_addr;
	resource_size_t	phys_size;
	resource_size_t	xlat_align;
	resource_size_t	xlat_align_size;
	void __iomem	*vbase;
	size_t		xlat_size;
	size_t		buf_size;
	dma_addr_t	dma_addr;
	void		*virt_addr;
};
struct perf_ctx;

struct pthr_ctx {
	struct task_struct	*thread;
	struct perf_ctx		*perf;
	atomic_t		dma_sync;
	struct dma_chan		*dma_chan;
	int			dma_prep_err;
	int			src_idx;
	void			*srcs[MAX_SRCS];
};
struct perf_ctx {
	struct ntb_dev		*ntb;
	spinlock_t		db_lock;
	struct perf_mw		mw;
	bool			link_is_up;
	struct work_struct	link_cleanup;
	struct delayed_work	link_work;
	struct dentry		*debugfs_node_dir;
	struct dentry		*debugfs_run;
	struct dentry		*debugfs_threads;
	u8			perf_threads;
	bool			run;
	struct pthr_ctx		pthr_ctx[MAX_THREADS];
	atomic_t		tsync;
};
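/*
 * Scratchpad register indices used for the link-up handshake in
 * perf_link_work(). The enum itself is not present in this excerpt; the
 * names come from the reads and writes below and the ordering is assumed.
 */
enum {
	VERSION = 0,
	MW_SZ_HIGH,
	MW_SZ_LOW,
	MAX_SPAD
};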
static void perf_link_event(void *ctx)
{
	struct perf_ctx *perf = ctx;

	if (ntb_link_is_up(perf->ntb, NULL, NULL) == 1)
		schedule_delayed_work(&perf->link_work, 2*HZ);
	else
		schedule_work(&perf->link_cleanup);
}
static void perf_db_event(void *ctx, int vec)
{
	struct perf_ctx *perf = ctx;
	u64 db_bits, db_mask;

	db_mask = ntb_db_vector_mask(perf->ntb, vec);
	db_bits = ntb_db_read(perf->ntb);

	dev_dbg(&perf->ntb->dev, "doorbell vec %d mask %#llx bits %#llx\n",
		vec, db_mask, db_bits);
}
static const struct ntb_ctx_ops perf_ops = {
	.link_event = perf_link_event,
	.db_event = perf_db_event,
};
static void perf_copy_callback(void *data)
{
	struct pthr_ctx *pctx = data;

	atomic_dec(&pctx->dma_sync);
}
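/*
 * Copy one buffer segment into the peer's memory window, either with a
 * plain memcpy_toio() or, when use_dma is set, by submitting a memcpy
 * descriptor to the DMA engine bound to this thread. Outstanding DMA
 * completions are tracked in pctx->dma_sync via perf_copy_callback().
 */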
static ssize_t perf_copy(struct pthr_ctx *pctx, char __iomem *dst,
			 char *src, size_t size)
{
	struct perf_ctx *perf = pctx->perf;
	struct dma_async_tx_descriptor *txd;
	struct dma_chan *chan = pctx->dma_chan;
	struct dma_device *device;
	struct dmaengine_unmap_data *unmap;
	dma_cookie_t cookie;
	size_t src_off, dst_off;
	struct perf_mw *mw = &perf->mw;
	void __iomem *vbase;
	void __iomem *dst_vaddr;
	dma_addr_t dst_phys;
	int retries = 0;

	if (!use_dma) {
		memcpy_toio(dst, src, size);
		return size;
	}

	if (!chan) {
		dev_err(&perf->ntb->dev, "DMA engine does not exist\n");
		return -EINVAL;
	}

	device = chan->device;
	src_off = (uintptr_t)src & ~PAGE_MASK;
	dst_off = (uintptr_t __force)dst & ~PAGE_MASK;

	if (!is_dma_copy_aligned(device, src_off, dst_off, size))
		return -ENODEV;

	vbase = mw->vbase;
	dst_vaddr = dst;
	dst_phys = mw->phys_addr + (dst_vaddr - vbase);

	unmap = dmaengine_get_unmap_data(device->dev, 1, GFP_NOWAIT);
	if (!unmap)
		return -ENOMEM;

	unmap->len = size;
	unmap->addr[0] = dma_map_page(device->dev, virt_to_page(src),
				      src_off, size, DMA_TO_DEVICE);
	if (dma_mapping_error(device->dev, unmap->addr[0]))
		goto err_get_unmap;

	unmap->to_cnt = 1;

	do {
		txd = device->device_prep_dma_memcpy(chan, dst_phys,
						     unmap->addr[0],
						     size, DMA_PREP_INTERRUPT);
		if (!txd) {
			set_current_state(TASK_INTERRUPTIBLE);
			schedule_timeout(DMA_OUT_RESOURCE_TO);
		}
	} while (!txd && (++retries < DMA_RETRIES));

	if (!txd) {
		pctx->dma_prep_err++;
		goto err_get_unmap;
	}

	txd->callback = perf_copy_callback;
	txd->callback_param = pctx;
	dma_set_unmap(txd, unmap);

	cookie = dmaengine_submit(txd);
	if (dma_submit_error(cookie))
		goto err_set_unmap;

	atomic_inc(&pctx->dma_sync);
	dma_async_issue_pending(chan);

	return size;

err_set_unmap:
	dmaengine_unmap_put(unmap);
err_get_unmap:
	dmaengine_unmap_put(unmap);
	return 0;
}
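/*
 * Stream 'total' bytes through the memory window in 'buf_size' chunks,
 * wrapping back to the start of the window every 'win_size' bytes, then
 * report the measured throughput via pr_info().
 */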
static int perf_move_data(struct pthr_ctx *pctx, char __iomem *dst, char *src,
			  u64 buf_size, u64 win_size, u64 total)
{
	int chunks, total_chunks, i;
	int copied_chunks = 0;
	u64 copied = 0, result;
	char __iomem *tmp = dst;
	u64 perf, diff_us;
	ktime_t kstart, kstop, kdiff;

	chunks = div64_u64(win_size, buf_size);
	total_chunks = div64_u64(total, buf_size);
	kstart = ktime_get();

	for (i = 0; i < total_chunks; i++) {
		result = perf_copy(pctx, tmp, src, buf_size);
		copied += result;
		copied_chunks++;
		if (copied_chunks == chunks) {
			tmp = dst;
			copied_chunks = 0;
		} else
			tmp += buf_size;

		/* Yield every 4GB in the CPU-copy path to avoid soft lockups. */
		if (((copied % SZ_4G) == 0) && !use_dma) {
			set_current_state(TASK_INTERRUPTIBLE);
			schedule_timeout(1);
		}
	}

	if (use_dma) {
		pr_info("%s: All DMA descriptors submitted\n", current->comm);
		while (atomic_read(&pctx->dma_sync) != 0)
			msleep(20);
	}

	kstop = ktime_get();
	kdiff = ktime_sub(kstop, kstart);
	diff_us = ktime_to_us(kdiff);

	pr_info("%s: copied %llu bytes\n", current->comm, copied);
	pr_info("%s: lasted %llu usecs\n", current->comm, diff_us);

	perf = div64_u64(copied, diff_us);

	pr_info("%s: MBytes/s: %llu\n", current->comm, perf);

	return 0;
}
static bool perf_dma_filter_fn(struct dma_chan *chan, void *node)
{
	return dev_to_node(&chan->dev->device) == (int)(unsigned long)node;
}
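/*
 * Per-thread worker: optionally grabs a DMA channel on the local NUMA node,
 * allocates the source buffers, waits until all sibling threads are ready
 * (perf->tsync), then runs one perf_move_data() pass and cleans up.
 */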
static int ntb_perf_thread(void *data)
{
	struct pthr_ctx *pctx = data;
	struct perf_ctx *perf = pctx->perf;
	struct pci_dev *pdev = perf->ntb->pdev;
	struct perf_mw *mw = &perf->mw;
	char __iomem *dst;
	u64 win_size, buf_size, total;
	void *src;
	int rc, node, i;
	struct dma_chan *dma_chan = NULL;

	pr_info("kthread %s starting...\n", current->comm);

	node = dev_to_node(&pdev->dev);

	if (use_dma && !pctx->dma_chan) {
		dma_cap_mask_t dma_mask;

		dma_cap_zero(dma_mask);
		dma_cap_set(DMA_MEMCPY, dma_mask);
		dma_chan = dma_request_channel(dma_mask, perf_dma_filter_fn,
					       (void *)(unsigned long)node);
		if (!dma_chan) {
			pr_warn("%s: cannot acquire DMA channel, quitting\n",
				current->comm);
			return -ENODEV;
		}
		pctx->dma_chan = dma_chan;
	}

	for (i = 0; i < MAX_SRCS; i++) {
		pctx->srcs[i] = kmalloc_node(MAX_TEST_SIZE, GFP_KERNEL, node);
		if (!pctx->srcs[i]) {
			rc = -ENOMEM;
			goto err;
		}
	}

	win_size = mw->phys_size;
	buf_size = 1ULL << seg_order;
	total = 1ULL << run_order;

	if (buf_size > MAX_TEST_SIZE)
		buf_size = MAX_TEST_SIZE;

	dst = (char __iomem *)mw->vbase;

	atomic_inc(&perf->tsync);
	while (atomic_read(&perf->tsync) != perf->perf_threads)
		schedule();

	src = pctx->srcs[pctx->src_idx];
	pctx->src_idx = (pctx->src_idx + 1) & (MAX_SRCS - 1);

	rc = perf_move_data(pctx, dst, src, buf_size, win_size, total);

	atomic_dec(&perf->tsync);

	if (rc < 0) {
		pr_err("%s: failed\n", current->comm);
		goto err;
	}

	for (i = 0; i < MAX_SRCS; i++) {
		kfree(pctx->srcs[i]);
		pctx->srcs[i] = NULL;
	}

	return 0;

err:
	for (i = 0; i < MAX_SRCS; i++) {
		kfree(pctx->srcs[i]);
		pctx->srcs[i] = NULL;
	}

	if (dma_chan) {
		dma_release_channel(dma_chan);
		pctx->dma_chan = NULL;
	}

	return rc;
}
static void perf_free_mw(struct perf_ctx *perf)
{
	struct perf_mw *mw = &perf->mw;
	struct pci_dev *pdev = perf->ntb->pdev;

	if (!mw->virt_addr)
		return;

	ntb_mw_clear_trans(perf->ntb, 0);
	dma_free_coherent(&pdev->dev, mw->buf_size,
			  mw->virt_addr, mw->dma_addr);
	mw->xlat_size = 0;
	mw->buf_size = 0;
	mw->virt_addr = NULL;
}
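/*
 * Size the inbound buffer to match the peer's memory window: round the
 * requested size up to the translation alignment constraints, allocate a
 * coherent buffer, and program the mw0 translation to point at it.
 */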
static int perf_set_mw(struct perf_ctx *perf, resource_size_t size)
{
	struct perf_mw *mw = &perf->mw;
	size_t xlat_size, buf_size;
	int rc;

	if (!size)
		return -EINVAL;

	xlat_size = round_up(size, mw->xlat_align_size);
	buf_size = round_up(size, mw->xlat_align);

	/* Nothing to do if the window is already set up at this size. */
	if (mw->xlat_size == xlat_size)
		return 0;

	if (mw->buf_size)
		perf_free_mw(perf);

	mw->xlat_size = xlat_size;
	mw->buf_size = buf_size;

	mw->virt_addr = dma_alloc_coherent(&perf->ntb->pdev->dev, buf_size,
					   &mw->dma_addr, GFP_KERNEL);
	if (!mw->virt_addr) {
		mw->xlat_size = 0;
		mw->buf_size = 0;
		return -ENOMEM;
	}

	rc = ntb_mw_set_trans(perf->ntb, 0, mw->dma_addr, mw->xlat_size);
	if (rc) {
		dev_err(&perf->ntb->dev, "Unable to set mw0 translation\n");
		perf_free_mw(perf);
		return -EIO;
	}

	return 0;
}
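/*
 * Link-up handshake: publish our memory-window size and driver version in
 * the peer scratchpads, then read back what the peer advertised. When the
 * versions match, size the local buffer with perf_set_mw() and mark the
 * link up; otherwise retry while the hardware link stays up.
 */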
static void perf_link_work(struct work_struct *work)
{
	struct perf_ctx *perf =
		container_of(work, struct perf_ctx, link_work.work);
	struct ntb_dev *ndev = perf->ntb;
	struct pci_dev *pdev = ndev->pdev;
	u32 val;
	u64 size;
	int rc;

	dev_dbg(&perf->ntb->pdev->dev, "%s called\n", __func__);

	size = perf->mw.phys_size;
	ntb_peer_spad_write(ndev, MW_SZ_HIGH, upper_32_bits(size));
	ntb_peer_spad_write(ndev, MW_SZ_LOW, lower_32_bits(size));
	ntb_peer_spad_write(ndev, VERSION, PERF_VERSION);

	/* now read what peer wrote */
	val = ntb_spad_read(ndev, VERSION);
	if (val != PERF_VERSION) {
		dev_dbg(&pdev->dev, "Remote version = %#x\n", val);
		goto out;
	}

	val = ntb_spad_read(ndev, MW_SZ_HIGH);
	size = (u64)val << 32;

	val = ntb_spad_read(ndev, MW_SZ_LOW);
	size |= val;

	dev_dbg(&pdev->dev, "Remote MW size = %#llx\n", size);

	rc = perf_set_mw(perf, size);
	if (rc)
		goto out;

	perf->link_is_up = true;

	return;

out:
	if (ntb_link_is_up(ndev, NULL, NULL) == 1)
		schedule_delayed_work(&perf->link_work,
				      msecs_to_jiffies(PERF_LINK_DOWN_TIMEOUT));
}
static void perf_link_cleanup(struct work_struct *work)
{
	struct perf_ctx *perf = container_of(work,
					     struct perf_ctx,
					     link_cleanup);

	dev_dbg(&perf->ntb->pdev->dev, "%s called\n", __func__);

	if (!perf->link_is_up)
		cancel_delayed_work_sync(&perf->link_work);
}
static int perf_setup_mw(struct ntb_dev *ntb, struct perf_ctx *perf)
{
	struct perf_mw *mw;
	int rc;

	mw = &perf->mw;

	rc = ntb_mw_get_range(ntb, 0, &mw->phys_addr, &mw->phys_size,
			      &mw->xlat_align, &mw->xlat_align_size);
	if (rc)
		return rc;

	perf->mw.vbase = ioremap_wc(mw->phys_addr, mw->phys_size);
	if (!mw->vbase)
		return -ENOMEM;

	return 0;
}
static ssize_t debugfs_run_read(struct file *filp, char __user *ubuf,
				size_t count, loff_t *offp)
{
	struct perf_ctx *perf = filp->private_data;
	char *buf;
	ssize_t ret, out_offset;

	buf = kmalloc(64, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;
	out_offset = snprintf(buf, 64, "%d\n", perf->run);
	ret = simple_read_from_buffer(ubuf, count, offp, buf, out_offset);
	kfree(buf);

	return ret;
}
static void threads_cleanup(struct perf_ctx *perf)
{
	struct pthr_ctx *pctx;
	int i;

	perf->run = false;
	for (i = 0; i < MAX_THREADS; i++) {
		pctx = &perf->pthr_ctx[i];
		if (pctx->thread) {
			kthread_stop(pctx->thread);
			pctx->thread = NULL;
		}
	}
}
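/*
 * Writing to the "run" debugfs file toggles the test: if threads are still
 * running they are stopped, otherwise the module parameters are clamped to
 * sane values and perf->perf_threads kthreads are launched on the device's
 * NUMA node.
 */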
static ssize_t debugfs_run_write(struct file *filp, const char __user *ubuf,
				 size_t count, loff_t *offp)
{
	struct perf_ctx *perf = filp->private_data;
	int node, i;

	if (!perf->link_is_up)
		return 0;

	if (perf->perf_threads == 0)
		return 0;

	if (atomic_read(&perf->tsync) == 0)
		perf->run = false;

	if (perf->run)
		threads_cleanup(perf);
	else {
		perf->run = true;

		if (perf->perf_threads > MAX_THREADS) {
			perf->perf_threads = MAX_THREADS;
			pr_info("Reset total threads to: %u\n", MAX_THREADS);
		}

		/* no greater than 1M */
		if (seg_order > MAX_SEG_ORDER) {
			seg_order = MAX_SEG_ORDER;
			pr_info("Fix seg_order to %u\n", seg_order);
		}

		if (run_order < seg_order) {
			run_order = seg_order;
			pr_info("Fix run_order to %u\n", run_order);
		}

		node = dev_to_node(&perf->ntb->pdev->dev);
		/* launch kernel threads */
		for (i = 0; i < perf->perf_threads; i++) {
			struct pthr_ctx *pctx;

			pctx = &perf->pthr_ctx[i];
			atomic_set(&pctx->dma_sync, 0);
			pctx->perf = perf;
			pctx->thread =
				kthread_create_on_node(ntb_perf_thread,
						       (void *)pctx,
						       node, "ntb_perf %d", i);
			if (IS_ERR(pctx->thread)) {
				pctx->thread = NULL;
				goto err;
			}
			wake_up_process(pctx->thread);

			if (perf->run == false)
				return -ENXIO;
		}
	}

	return count;

err:
	threads_cleanup(perf);
	return -ENXIO;
}
static const struct file_operations ntb_perf_debugfs_run = {
	.owner = THIS_MODULE,
	.open = simple_open,
	.read = debugfs_run_read,
	.write = debugfs_run_write,
};
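/*
 * The debugfs layout created below is ntb_perf/<pci device>/{threads,run}.
 * Typical usage from userspace, assuming debugfs is mounted at the usual
 * /sys/kernel/debug and using a placeholder PCI address:
 *
 *   echo 8 > /sys/kernel/debug/ntb_perf/0000:03:00.1/threads
 *   echo 1 > /sys/kernel/debug/ntb_perf/0000:03:00.1/run
 *   dmesg | grep 'MBytes/s'
 */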
static int perf_debugfs_setup(struct perf_ctx *perf)
{
	struct pci_dev *pdev = perf->ntb->pdev;

	if (!debugfs_initialized())
		return -ENODEV;

	if (!perf_debugfs_dir) {
		perf_debugfs_dir = debugfs_create_dir(KBUILD_MODNAME, NULL);
		if (!perf_debugfs_dir)
			return -ENODEV;
	}

	perf->debugfs_node_dir = debugfs_create_dir(pci_name(pdev),
						    perf_debugfs_dir);
	if (!perf->debugfs_node_dir)
		return -ENODEV;

	perf->debugfs_run = debugfs_create_file("run", S_IRUSR | S_IWUSR,
						perf->debugfs_node_dir, perf,
						&ntb_perf_debugfs_run);
	if (!perf->debugfs_run)
		return -ENODEV;

	perf->debugfs_threads = debugfs_create_u8("threads", S_IRUSR | S_IWUSR,
						  perf->debugfs_node_dir,
						  &perf->perf_threads);
	if (!perf->debugfs_threads)
		return -ENODEV;

	return 0;
}
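/*
 * NTB client probe: allocate the per-device context on the local NUMA node,
 * map the memory window, register the link/doorbell callbacks, enable the
 * link and expose the debugfs control files.
 */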
static int perf_probe(struct ntb_client *client, struct ntb_dev *ntb)
{
	struct pci_dev *pdev = ntb->pdev;
	struct perf_ctx *perf;
	int node;
	int rc;

	node = dev_to_node(&pdev->dev);

	perf = kzalloc_node(sizeof(*perf), GFP_KERNEL, node);
	if (!perf)
		return -ENOMEM;

	perf->ntb = ntb;
	perf->perf_threads = 1;
	atomic_set(&perf->tsync, 0);
	perf->run = false;
	spin_lock_init(&perf->db_lock);
	perf_setup_mw(ntb, perf);
	INIT_DELAYED_WORK(&perf->link_work, perf_link_work);
	INIT_WORK(&perf->link_cleanup, perf_link_cleanup);

	rc = ntb_set_ctx(ntb, perf, &perf_ops);
	if (rc)
		goto err_ctx;

	perf->link_is_up = false;
	ntb_link_enable(ntb, NTB_SPEED_AUTO, NTB_WIDTH_AUTO);
	ntb_link_event(ntb);

	rc = perf_debugfs_setup(perf);
	if (rc)
		goto err_ctx;

	return 0;

err_ctx:
	cancel_delayed_work_sync(&perf->link_work);
	cancel_work_sync(&perf->link_cleanup);
	kfree(perf);
	return rc;
}
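/*
 * NTB client remove: stop the link work, tear down the context, release any
 * DMA channels the worker threads still hold, and free the memory window.
 */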
static void perf_remove(struct ntb_client *client, struct ntb_dev *ntb)
{
	struct perf_ctx *perf = ntb->ctx;
	int i;

	dev_dbg(&perf->ntb->dev, "%s called\n", __func__);

	cancel_delayed_work_sync(&perf->link_work);
	cancel_work_sync(&perf->link_cleanup);

	ntb_clear_ctx(ntb);
	ntb_link_disable(ntb);

	debugfs_remove_recursive(perf_debugfs_dir);
	perf_debugfs_dir = NULL;

	if (use_dma) {
		for (i = 0; i < MAX_THREADS; i++) {
			struct pthr_ctx *pctx = &perf->pthr_ctx[i];

			if (pctx->dma_chan)
				dma_release_channel(pctx->dma_chan);
		}
	}

	perf_free_mw(perf);

	kfree(perf);
}
static struct ntb_client perf_client = {
	.ops = {
		.probe = perf_probe,
		.remove = perf_remove,
	},
};
module_ntb_client(perf_client);