2 * This file is provided under a dual BSD/GPLv2 license. When using or
3 * redistributing this file, you may do so under either license.
7 * Copyright(c) 2015 Intel Corporation. All rights reserved.
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of version 2 of the GNU General Public License as
11 * published by the Free Software Foundation.
15 * Copyright(c) 2015 Intel Corporation. All rights reserved.
17 * Redistribution and use in source and binary forms, with or without
18 * modification, are permitted provided that the following conditions
21 * * Redistributions of source code must retain the above copyright
22 * notice, this list of conditions and the following disclaimer.
23 * * Redistributions in binary form must reproduce the above copyright
24 * notice, this list of conditions and the following disclaimer in
25 * the documentation and/or other materials provided with the
27 * * Neither the name of Intel Corporation nor the names of its
28 * contributors may be used to endorse or promote products derived
29 * from this software without specific prior written permission.
31 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
32 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
33 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
34 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
35 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
36 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
37 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
38 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
39 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
40 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
41 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43 * PCIe NTB Perf Linux driver
46 #include <linux/init.h>
47 #include <linux/kernel.h>
48 #include <linux/module.h>
49 #include <linux/kthread.h>
50 #include <linux/time.h>
51 #include <linux/timer.h>
52 #include <linux/dma-mapping.h>
53 #include <linux/pci.h>
54 #include <linux/slab.h>
55 #include <linux/spinlock.h>
56 #include <linux/debugfs.h>
57 #include <linux/dmaengine.h>
58 #include <linux/delay.h>
59 #include <linux/sizes.h>
60 #include <linux/ntb.h>
61 #include <linux/mutex.h>
/*
 * Module identification strings; the license, version, author and
 * description values feed the MODULE_* macros.
 */
63 #define DRIVER_NAME "ntb_perf"
64 #define DRIVER_DESCRIPTION "PCIe NTB Performance Measurement Tool"
66 #define DRIVER_LICENSE "Dual BSD/GPL"
67 #define DRIVER_VERSION "1.0"
68 #define DRIVER_AUTHOR "Dave Jiang <dave.jiang@intel.com>"
/* Delay in milliseconds (passed to msecs_to_jiffies()) before perf_link_work() is re-queued. */
70 #define PERF_LINK_DOWN_TIMEOUT 10
/* Value written to the peer's VERSION scratchpad and compared with the locally read one. */
71 #define PERF_VERSION 0xffff0001
/* Size of the pthr_ctx[] array and upper bound applied to perf_threads. */
72 #define MAX_THREADS 32
/* Size of each kmalloc'ed source buffer and cap on the per-copy buffer size. */
73 #define MAX_TEST_SIZE SZ_1M
/* Timeout in jiffies handed to schedule_timeout() between DMA descriptor prep attempts. */
75 #define DMA_OUT_RESOURCE_TO 50
/* Upper bound on the retry counter of the DMA descriptor prep loop. */
76 #define DMA_RETRIES 20
/* 2^32 bytes; no use of this macro is visible in this listing. */
77 #define SZ_4G (1ULL << 32)
/* Largest accepted seg_order; larger values are clamped in debugfs_run_write(). */
78 #define MAX_SEG_ORDER 20 /* no larger than 1M for kmalloc buffer */
/* Module metadata, taken from the DRIVER_* macros. */
80 MODULE_LICENSE(DRIVER_LICENSE
);
81 MODULE_VERSION(DRIVER_VERSION
);
82 MODULE_AUTHOR(DRIVER_AUTHOR
);
83 MODULE_DESCRIPTION(DRIVER_DESCRIPTION
);
/*
 * Module-wide debugfs directory. Created on demand by perf_debugfs_setup()
 * when it is still NULL, and removed and reset to NULL by perf_remove().
 */
85 static struct dentry
*perf_debugfs_dir
;
/*
 * Optional limit on the memory-window size. perf_link_work() tests the
 * window's phys_size against it whenever it is non-zero.
 * NOTE(review): the statement executed when the limit is exceeded is not
 * present in this listing; presumably the size is clamped - confirm.
 */
87 static unsigned long max_mw_size
;
88 module_param(max_mw_size
, ulong
, 0644);
89 MODULE_PARM_DESC(max_mw_size
, "Limit size of large memory windows");
/*
 * log2 of the per-copy buffer size: ntb_perf_thread() computes the size as
 * 1ULL << seg_order (then caps it at MAX_TEST_SIZE), and debugfs_run_write()
 * clamps the order itself to MAX_SEG_ORDER. The description therefore reads
 * "2^n" (a power of two), not "n^2".
 */
91 static unsigned int seg_order
= 19; /* 512K */
92 module_param(seg_order
, uint
, 0644);
93 MODULE_PARM_DESC(seg_order
, "size order [2^n] of buffer segment for testing");
/*
 * log2 of the total number of bytes each thread transfers:
 * ntb_perf_thread() computes the total as 1ULL << run_order, and
 * debugfs_run_write() raises run_order to seg_order when it is smaller.
 * The description therefore reads "2^n" (a power of two), not "n^2".
 */
95 static unsigned int run_order
= 32; /* 4G */
96 module_param(run_order
, uint
, 0644);
97 MODULE_PARM_DESC(run_order
, "size order [2^n] of total data to transfer");
/*
 * When set, ntb_perf_thread() requests a DMA_MEMCPY channel for each worker
 * that does not already hold one.
 */
99 static bool use_dma
; /* default to 0 */
100 module_param(use_dma
, bool, 0644);
101 MODULE_PARM_DESC(use_dma
, "Using DMA engine to measure performance");
/*
 * Members accessed through `struct perf_mw *` elsewhere in the file (the
 * struct's opening line is not present in this listing). All four are
 * filled in by ntb_mw_get_range() in perf_setup_mw().
 */
104 phys_addr_t phys_addr
;
105 resource_size_t phys_size
;
106 resource_size_t xlat_align
;
107 resource_size_t xlat_align_size
;
/*
 * Per-thread members accessed through `struct pthr_ctx *` elsewhere in the
 * file (the struct's opening line and further members are not present in
 * this listing).
 */
/* kthread handle; stopped with kthread_stop() in threads_cleanup(). */
118 struct task_struct
*thread
;
/* Back-pointer to the owning device context. */
119 struct perf_ctx
*perf
;
/* DMA channel requested in ntb_perf_thread() and released there or in perf_remove(). */
121 struct dma_chan
*dma_chan
;
/* Source buffers allocated with kmalloc_node() in ntb_perf_thread(). */
124 void *srcs
[MAX_SRCS
];
125 wait_queue_head_t
*wq
;
/*
 * Per-device members accessed through `struct perf_ctx *` elsewhere in the
 * file (the struct's opening line and further members are not present in
 * this listing).
 */
/* Delayed work item running perf_link_work(). */
136 struct delayed_work link_work
;
/* Woken by perf_link_work() once link_is_up is set; debugfs_run_write() waits on it. */
137 wait_queue_head_t link_wq
;
/* debugfs entries created by perf_debugfs_setup(). */
138 struct dentry
*debugfs_node_dir
;
139 struct dentry
*debugfs_run
;
140 struct dentry
*debugfs_threads
;
142 /* mutex ensures only one set of threads run at once */
143 struct mutex run_mutex
;
/* One slot per possible worker thread. */
144 struct pthr_ctx pthr_ctx
[MAX_THREADS
];
/*
 * Link-event callback, installed through perf_ops.link_event.
 * @ctx: the struct perf_ctx registered with ntb_set_ctx().
 */
156 static void perf_link_event(void *ctx
)
158 struct perf_ctx
*perf
= ctx
;
160 if (ntb_link_is_up(perf
->ntb
, NULL
, NULL
) == 1) {
/* Link reports up: queue link_work with a delay of 2*HZ jiffies. */
161 schedule_delayed_work(&perf
->link_work
, 2*HZ
);
/*
 * NOTE(review): the statements below log "link down" and clear link_is_up;
 * presumably they form the else branch, whose opening line is not present
 * in this listing.
 */
163 dev_dbg(&perf
->ntb
->pdev
->dev
, "link down\n");
/* The link was never marked up: cancel link_work and wait for it to finish. */
165 if (!perf
->link_is_up
)
166 cancel_delayed_work_sync(&perf
->link_work
);
168 perf
->link_is_up
= false;
/*
 * Doorbell callback, installed through perf_ops.db_event.
 * @ctx: the struct perf_ctx registered with ntb_set_ctx().
 * @vec: doorbell vector number.
 *
 * The visible statements only read the vector mask and the doorbell bits
 * and print them with dev_dbg().
 */
172 static void perf_db_event(void *ctx
, int vec
)
174 struct perf_ctx
*perf
= ctx
;
175 u64 db_bits
, db_mask
;
177 db_mask
= ntb_db_vector_mask(perf
->ntb
, vec
);
178 db_bits
= ntb_db_read(perf
->ntb
);
180 dev_dbg(&perf
->ntb
->dev
, "doorbell vec %d mask %#llx bits %#llx\n",
181 vec
, db_mask
, db_bits
);
/* Callback table handed to ntb_set_ctx() in perf_probe(). */
184 static const struct ntb_ctx_ops perf_ops
= {
185 .link_event
= perf_link_event
,
186 .db_event
= perf_db_event
,
/*
 * DMA completion callback, set as txd->callback in perf_copy().
 * @data: the struct pthr_ctx passed as callback_param.
 *
 * Decrements dma_sync, which perf_copy() increments after submitting a
 * descriptor.
 */
189 static void perf_copy_callback(void *data
)
191 struct pthr_ctx
*pctx
= data
;
193 atomic_dec(&pctx
->dma_sync
);
/*
 * Copy one buffer into the memory window.
 * @pctx: per-thread context (supplies the DMA channel and counters).
 * @dst:  destination inside the ioremapped memory window.
 * @src:  kernel source buffer.
 * @size: number of bytes to copy.
 *
 * The visible statements show two ways of copying: memcpy_toio() and a
 * DMA-engine memcpy on pctx->dma_chan.
 * NOTE(review): the conditions that choose between them, the return
 * statements and the error labels are not present in this listing.
 */
196 static ssize_t
perf_copy(struct pthr_ctx
*pctx
, char __iomem
*dst
,
197 char *src
, size_t size
)
199 struct perf_ctx
*perf
= pctx
->perf
;
200 struct dma_async_tx_descriptor
*txd
;
201 struct dma_chan
*chan
= pctx
->dma_chan
;
202 struct dma_device
*device
;
203 struct dmaengine_unmap_data
*unmap
;
205 size_t src_off
, dst_off
;
206 struct perf_mw
*mw
= &perf
->mw
;
208 void __iomem
*dst_vaddr
;
/* CPU copy path. */
213 memcpy_toio(dst
, src
, size
);
/* NOTE(review): presumably guarded by a check that chan is NULL - guard not shown. */
218 dev_err(&perf
->ntb
->dev
, "DMA engine does not exist\n");
222 device
= chan
->device
;
/* Offsets of src and dst within their pages. */
223 src_off
= (uintptr_t)src
& ~PAGE_MASK
;
224 dst_off
= (uintptr_t __force
)dst
& ~PAGE_MASK
;
226 if (!is_dma_copy_aligned(device
, src_off
, dst_off
, size
))
/*
 * Destination address for the DMA engine: window base address plus the
 * offset of dst_vaddr from vbase (both assigned on lines not shown here).
 */
231 dst_phys
= mw
->phys_addr
+ (dst_vaddr
- vbase
);
/* Unmap bookkeeping with room for one address, allocated with GFP_NOWAIT. */
233 unmap
= dmaengine_get_unmap_data(device
->dev
, 1, GFP_NOWAIT
);
/* Map the source page for device reads. */
238 unmap
->addr
[0] = dma_map_page(device
->dev
, virt_to_page(src
),
239 src_off
, size
, DMA_TO_DEVICE
);
240 if (dma_mapping_error(device
->dev
, unmap
->addr
[0]))
/*
 * Prepare the memcpy descriptor, retrying while txd is NULL and fewer
 * than DMA_RETRIES attempts were made; the visible sleep lasts
 * DMA_OUT_RESOURCE_TO jiffies. The loop's opening line is not shown.
 */
246 txd
= device
->device_prep_dma_memcpy(chan
, dst_phys
,
248 size
, DMA_PREP_INTERRUPT
);
250 set_current_state(TASK_INTERRUPTIBLE
);
251 schedule_timeout(DMA_OUT_RESOURCE_TO
);
253 } while (!txd
&& (++retries
< DMA_RETRIES
));
/* Count a failed descriptor preparation. */
256 pctx
->dma_prep_err
++;
/* perf_copy_callback() decrements dma_sync on completion. */
260 txd
->callback
= perf_copy_callback
;
261 txd
->callback_param
= pctx
;
262 dma_set_unmap(txd
, unmap
);
264 cookie
= dmaengine_submit(txd
);
265 if (dma_submit_error(cookie
))
/* One more descriptor in flight; then start the channel. */
268 atomic_inc(&pctx
->dma_sync
);
269 dma_async_issue_pending(chan
);
/* NOTE(review): two unmap releases, presumably on separate exit paths - labels not shown. */
274 dmaengine_unmap_put(unmap
);
276 dmaengine_unmap_put(unmap
);
/*
 * Copy loop for one worker thread.
 * @pctx:     per-thread context; receives the copied/diff_us results.
 * @dst:      start of the destination memory window.
 * @src:      source buffer.
 * @buf_size: bytes per perf_copy() call.
 * @win_size: size of the memory window.
 * @total:    total bytes to transfer.
 *
 * Calls perf_copy() total / buf_size times, waits for dma_sync to reach
 * zero, then stores the byte count and the elapsed microseconds in @pctx.
 * NOTE(review): several statements (result accumulation, destination
 * advance/wrap, returns) are not present in this listing.
 */
280 static int perf_move_data(struct pthr_ctx
*pctx
, char __iomem
*dst
, char *src
,
281 u64 buf_size
, u64 win_size
, u64 total
)
283 int chunks
, total_chunks
, i
;
284 int copied_chunks
= 0;
285 u64 copied
= 0, result
;
286 char __iomem
*tmp
= dst
;
288 ktime_t kstart
, kstop
, kdiff
;
289 unsigned long last_sleep
= jiffies
;
/* chunks: buffers that fit in the window; total_chunks: buffers to send. */
291 chunks
= div64_u64(win_size
, buf_size
);
292 total_chunks
= div64_u64(total
, buf_size
);
293 kstart
= ktime_get();
295 for (i
= 0; i
< total_chunks
; i
++) {
296 result
= perf_copy(pctx
, tmp
, src
, buf_size
);
/* NOTE(review): presumably the destination wraps back to dst here - body not shown. */
299 if (copied_chunks
== chunks
) {
305 /* Probably should schedule every 5s to prevent soft hang. */
306 if (unlikely((jiffies
- last_sleep
) > 5 * HZ
)) {
307 last_sleep
= jiffies
;
308 set_current_state(TASK_INTERRUPTIBLE
);
312 if (unlikely(kthread_should_stop()))
317 pr_debug("%s: All DMA descriptors submitted\n", current
->comm
);
/* Wait until every submitted descriptor has completed, unless told to stop. */
318 while (atomic_read(&pctx
->dma_sync
) != 0) {
319 if (kthread_should_stop())
326 kdiff
= ktime_sub(kstop
, kstart
);
327 diff_us
= ktime_to_us(kdiff
);
329 pr_debug("%s: copied %llu bytes\n", current
->comm
, copied
);
331 pr_debug("%s: lasted %llu usecs\n", current
->comm
, diff_us
);
/*
 * Bytes per microsecond, printed as MBytes/s.
 * NOTE(review): no check that diff_us is non-zero is visible before this
 * division - confirm.
 */
333 perf
= div64_u64(copied
, diff_us
);
335 pr_debug("%s: MBytes/s: %llu\n", current
->comm
, perf
);
/* Results later reported by debugfs_run_read(). */
337 pctx
->copied
= copied
;
338 pctx
->diff_us
= diff_us
;
/*
 * Filter passed to dma_request_channel().
 * @chan: candidate DMA channel.
 * @node: NUMA node id carried in a pointer (cast from unsigned long by the
 *        caller).
 *
 * Returns true when the channel's device is on the requested node.
 */
343 static bool perf_dma_filter_fn(struct dma_chan
*chan
, void *node
)
345 return dev_to_node(&chan
->dev
->device
) == (int)(unsigned long)node
;
/*
 * Body of one worker kthread, started from debugfs_run_write().
 * @data: the thread's struct pthr_ctx.
 *
 * Visible steps: optionally acquire a DMA channel, allocate the source
 * buffers, derive the transfer sizes, synchronise with the other workers
 * through perf->tsync, run perf_move_data(), free the buffers, bump
 * perf->tdone and then wait to be stopped.
 * NOTE(review): error paths, labels and return statements are not present
 * in this listing.
 */
348 static int ntb_perf_thread(void *data
)
350 struct pthr_ctx
*pctx
= data
;
351 struct perf_ctx
*perf
= pctx
->perf
;
352 struct pci_dev
*pdev
= perf
->ntb
->pdev
;
353 struct perf_mw
*mw
= &perf
->mw
;
355 u64 win_size
, buf_size
, total
;
358 struct dma_chan
*dma_chan
= NULL
;
360 pr_debug("kthread %s starting...\n", current
->comm
);
/* NUMA node of the PCI device; used for the channel filter and allocations. */
362 node
= dev_to_node(&pdev
->dev
);
/* Request a DMA_MEMCPY channel on that node if DMA is enabled and none is held. */
364 if (use_dma
&& !pctx
->dma_chan
) {
365 dma_cap_mask_t dma_mask
;
367 dma_cap_zero(dma_mask
);
368 dma_cap_set(DMA_MEMCPY
, dma_mask
);
369 dma_chan
= dma_request_channel(dma_mask
, perf_dma_filter_fn
,
370 (void *)(unsigned long)node
);
372 pr_warn("%s: cannot acquire DMA channel, quitting\n",
376 pctx
->dma_chan
= dma_chan
;
/* Allocate MAX_SRCS source buffers of MAX_TEST_SIZE bytes on the device's node. */
379 for (i
= 0; i
< MAX_SRCS
; i
++) {
380 pctx
->srcs
[i
] = kmalloc_node(MAX_TEST_SIZE
, GFP_KERNEL
, node
);
381 if (!pctx
->srcs
[i
]) {
/* Window size, per-copy size (2^seg_order, capped) and total (2^run_order). */
387 win_size
= mw
->phys_size
;
388 buf_size
= 1ULL << seg_order
;
389 total
= 1ULL << run_order
;
391 if (buf_size
> MAX_TEST_SIZE
)
392 buf_size
= MAX_TEST_SIZE
;
394 dst
= (char __iomem
*)mw
->vbase
;
/* Rendezvous: wait until tsync equals perf_threads (loop body not shown). */
396 atomic_inc(&perf
->tsync
);
397 while (atomic_read(&perf
->tsync
) != perf
->perf_threads
)
/*
 * Pick the current source buffer and advance the index.
 * NOTE(review): the "& (MAX_SRCS - 1)" wrap assumes MAX_SRCS is a power of
 * two; its definition is not present in this listing.
 */
400 src
= pctx
->srcs
[pctx
->src_idx
];
401 pctx
->src_idx
= (pctx
->src_idx
+ 1) & (MAX_SRCS
- 1);
403 rc
= perf_move_data(pctx
, dst
, src
, buf_size
, win_size
, total
);
405 atomic_dec(&perf
->tsync
);
408 pr_err("%s: failed\n", current
->comm
);
/* Free the source buffers. */
413 for (i
= 0; i
< MAX_SRCS
; i
++) {
414 kfree(pctx
->srcs
[i
]);
415 pctx
->srcs
[i
] = NULL
;
/* debugfs_run_write() waits for tdone to equal perf_threads. */
418 atomic_inc(&perf
->tdone
);
/* NOTE(review): second free loop, presumably the error path - label not shown. */
424 for (i
= 0; i
< MAX_SRCS
; i
++) {
425 kfree(pctx
->srcs
[i
]);
426 pctx
->srcs
[i
] = NULL
;
430 dma_release_channel(dma_chan
);
431 pctx
->dma_chan
= NULL
;
435 /* Wait until we are told to stop */
437 set_current_state(TASK_INTERRUPTIBLE
);
438 if (kthread_should_stop())
442 __set_current_state(TASK_RUNNING
);
/*
 * Undo perf_set_mw(): clear the translation of memory window 0, free the
 * coherent buffer and reset virt_addr to NULL.
 * NOTE(review): any guard or further field resets are not present in this
 * listing.
 */
447 static void perf_free_mw(struct perf_ctx
*perf
)
449 struct perf_mw
*mw
= &perf
->mw
;
450 struct pci_dev
*pdev
= perf
->ntb
->pdev
;
455 ntb_mw_clear_trans(perf
->ntb
, 0);
456 dma_free_coherent(&pdev
->dev
, mw
->buf_size
,
457 mw
->virt_addr
, mw
->dma_addr
);
460 mw
->virt_addr
= NULL
;
/*
 * Allocate the inbound buffer for memory window 0 and program its
 * translation.
 * @perf: device context.
 * @size: requested size in bytes.
 *
 * NOTE(review): return statements and failure handling are not present in
 * this listing.
 */
463 static int perf_set_mw(struct perf_ctx
*perf
, resource_size_t size
)
465 struct perf_mw
*mw
= &perf
->mw
;
466 size_t xlat_size
, buf_size
;
/* Round the request up to the window's translation size and alignment. */
472 xlat_size
= round_up(size
, mw
->xlat_align_size
);
473 buf_size
= round_up(size
, mw
->xlat_align
);
/* NOTE(review): presumably returns early when the size is unchanged - body not shown. */
475 if (mw
->xlat_size
== xlat_size
)
481 mw
->xlat_size
= xlat_size
;
482 mw
->buf_size
= buf_size
;
484 mw
->virt_addr
= dma_alloc_coherent(&perf
->ntb
->pdev
->dev
, buf_size
,
485 &mw
->dma_addr
, GFP_KERNEL
);
486 if (!mw
->virt_addr
) {
/* Point memory window 0 at the newly allocated buffer. */
491 rc
= ntb_mw_set_trans(perf
->ntb
, 0, mw
->dma_addr
, mw
->xlat_size
);
493 dev_err(&perf
->ntb
->dev
, "Unable to set mw0 translation\n");
/*
 * Delayed-work handler that performs the handshake with the peer.
 * @work: the work_struct embedded in perf_ctx.link_work.
 *
 * Visible steps: publish the local window size and PERF_VERSION through
 * the peer scratchpads, read back what the peer wrote, set up the memory
 * window with the received size, mark the link up and wake waiters.
 * NOTE(review): returns, labels and some assignments are not present in
 * this listing.
 */
501 static void perf_link_work(struct work_struct
*work
)
503 struct perf_ctx
*perf
=
504 container_of(work
, struct perf_ctx
, link_work
.work
);
505 struct ntb_dev
*ndev
= perf
->ntb
;
506 struct pci_dev
*pdev
= ndev
->pdev
;
511 dev_dbg(&perf
->ntb
->pdev
->dev
, "%s called\n", __func__
);
513 size
= perf
->mw
.phys_size
;
/* NOTE(review): presumably clamps size to max_mw_size - body not shown. */
515 if (max_mw_size
&& size
> max_mw_size
)
/* Tell the peer our window size (high and low 32 bits) and version. */
518 ntb_peer_spad_write(ndev
, MW_SZ_HIGH
, upper_32_bits(size
));
519 ntb_peer_spad_write(ndev
, MW_SZ_LOW
, lower_32_bits(size
));
520 ntb_peer_spad_write(ndev
, VERSION
, PERF_VERSION
);
522 /* now read what peer wrote */
523 val
= ntb_spad_read(ndev
, VERSION
);
524 if (val
!= PERF_VERSION
) {
525 dev_dbg(&pdev
->dev
, "Remote version = %#x\n", val
);
/* Reassemble the peer's 64-bit size; the low-half merge is not shown. */
529 val
= ntb_spad_read(ndev
, MW_SZ_HIGH
);
530 size
= (u64
)val
<< 32;
532 val
= ntb_spad_read(ndev
, MW_SZ_LOW
);
535 dev_dbg(&pdev
->dev
, "Remote MW size = %#llx\n", size
);
537 rc
= perf_set_mw(perf
, size
);
/* Handshake complete: release debugfs_run_write() waiters. */
541 perf
->link_is_up
= true;
542 wake_up(&perf
->link_wq
);
/* Link still up: run this handler again after PERF_LINK_DOWN_TIMEOUT ms. */
550 if (ntb_link_is_up(ndev
, NULL
, NULL
) == 1)
551 schedule_delayed_work(&perf
->link_work
,
552 msecs_to_jiffies(PERF_LINK_DOWN_TIMEOUT
));
/*
 * Query the range of memory window 0 and map it write-combined.
 * @ntb:  NTB device.
 * @perf: device context whose mw fields are filled in.
 *
 * NOTE(review): the local mw declaration, error checks and return values
 * are not present in this listing.
 */
555 static int perf_setup_mw(struct ntb_dev
*ntb
, struct perf_ctx
*perf
)
562 rc
= ntb_mw_get_range(ntb
, 0, &mw
->phys_addr
, &mw
->phys_size
,
563 &mw
->xlat_align
, &mw
->xlat_align_size
);
567 perf
->mw
.vbase
= ioremap_wc(mw
->phys_addr
, mw
->phys_size
);
/*
 * read() handler of the "run" debugfs file.
 *
 * Formats either "running" (while run_mutex is held) or one line per
 * thread slot into a 1024-byte buffer and copies it to user space.
 * NOTE(review): several lines (NULL check, early exits, kfree, return)
 * are not present in this listing.
 */
574 static ssize_t
debugfs_run_read(struct file
*filp
, char __user
*ubuf
,
575 size_t count
, loff_t
*offp
)
577 struct perf_ctx
*perf
= filp
->private_data
;
579 ssize_t ret
, out_off
= 0;
580 struct pthr_ctx
*pctx
;
587 buf
= kmalloc(1024, GFP_KERNEL
);
/* A held run_mutex means a test is in progress. */
591 if (mutex_is_locked(&perf
->run_mutex
)) {
592 out_off
= snprintf(buf
, 64, "running\n");
596 for (i
= 0; i
< MAX_THREADS
; i
++) {
597 pctx
= &perf
->pthr_ctx
[i
];
/* -ENODATA is the value set by perf_clear_thread_status(). */
599 if (pctx
->status
== -ENODATA
)
/*
 * NOTE(review): snprintf() returns the untruncated length, so out_off can
 * pass 1024 after enough thread lines; "1024 - out_off" then goes negative
 * and out_off is later used as the buffer length - confirm and consider
 * scnprintf().
 */
603 out_off
+= snprintf(buf
+ out_off
, 1024 - out_off
,
/* NOTE(review): no check that diff_us is non-zero is visible - confirm. */
609 rate
= div64_u64(pctx
->copied
, pctx
->diff_us
);
610 out_off
+= snprintf(buf
+ out_off
, 1024 - out_off
,
611 "%d: copied %llu bytes in %llu usecs, %llu MBytes/s\n",
612 i
, pctx
->copied
, pctx
->diff_us
, rate
);
616 ret
= simple_read_from_buffer(ubuf
, count
, offp
, buf
, out_off
);
/*
 * Stop the worker threads and record their exit codes: for each of the
 * MAX_THREADS slots, store the kthread_stop() result in status.
 * NOTE(review): any guard around kthread_stop() is not present in this
 * listing.
 */
622 static void threads_cleanup(struct perf_ctx
*perf
)
624 struct pthr_ctx
*pctx
;
627 for (i
= 0; i
< MAX_THREADS
; i
++) {
628 pctx
= &perf
->pthr_ctx
[i
];
630 pctx
->status
= kthread_stop(pctx
->thread
);
/*
 * Mark every thread slot as having no result by setting status to
 * -ENODATA; debugfs_run_read() tests for this value.
 */
636 static void perf_clear_thread_status(struct perf_ctx
*perf
)
640 for (i
= 0; i
< MAX_THREADS
; i
++)
641 perf
->pthr_ctx
[i
].status
= -ENODATA
;
/*
 * write() handler of the "run" debugfs file: starts a test run.
 *
 * Visible steps: wait for the link, take run_mutex with trylock, sanitise
 * perf_threads / seg_order / run_order, create and wake one kthread per
 * requested thread, wait until all of them report done, then stop them and
 * release the mutex.
 * NOTE(review): return values, error labels and the remaining
 * kthread_create_on_node() arguments are not present in this listing.
 */
644 static ssize_t
debugfs_run_write(struct file
*filp
, const char __user
*ubuf
,
645 size_t count
, loff_t
*offp
)
647 struct perf_ctx
*perf
= filp
->private_data
;
649 DECLARE_WAIT_QUEUE_HEAD(wq
);
/* Block (interruptibly) until perf_link_work() has set link_is_up. */
651 if (wait_event_interruptible(perf
->link_wq
, perf
->link_is_up
))
654 if (perf
->perf_threads
== 0)
/* Only one run at a time. */
657 if (!mutex_trylock(&perf
->run_mutex
))
660 perf_clear_thread_status(perf
);
662 if (perf
->perf_threads
> MAX_THREADS
) {
663 perf
->perf_threads
= MAX_THREADS
;
664 pr_info("Reset total threads to: %u\n", MAX_THREADS
);
667 /* no greater than 1M */
668 if (seg_order
> MAX_SEG_ORDER
) {
669 seg_order
= MAX_SEG_ORDER
;
670 pr_info("Fix seg_order to %u\n", seg_order
);
/* The total transfer must be at least one segment. */
673 if (run_order
< seg_order
) {
674 run_order
= seg_order
;
675 pr_info("Fix run_order to %u\n", run_order
);
678 node
= dev_to_node(&perf
->ntb
->pdev
->dev
);
679 atomic_set(&perf
->tdone
, 0);
681 /* launch kernel thread */
682 for (i
= 0; i
< perf
->perf_threads
; i
++) {
683 struct pthr_ctx
*pctx
;
685 pctx
= &perf
->pthr_ctx
[i
];
686 atomic_set(&pctx
->dma_sync
, 0);
690 kthread_create_on_node(ntb_perf_thread
,
692 node
, "ntb_perf %d", i
);
693 if (IS_ERR(pctx
->thread
)) {
697 wake_up_process(pctx
->thread
);
/* Each worker increments tdone when its transfer has finished. */
701 wait_event_interruptible(wq
,
702 atomic_read(&perf
->tdone
) == perf
->perf_threads
);
704 threads_cleanup(perf
);
705 mutex_unlock(&perf
->run_mutex
);
/* NOTE(review): second cleanup/unlock pair, presumably the error path - label not shown. */
709 threads_cleanup(perf
);
710 mutex_unlock(&perf
->run_mutex
);
/* File operations of the "run" debugfs file created in perf_debugfs_setup(). */
714 static const struct file_operations ntb_perf_debugfs_run
= {
715 .owner
= THIS_MODULE
,
717 .read
= debugfs_run_read
,
718 .write
= debugfs_run_write
,
/*
 * Create the debugfs hierarchy for one device: the module directory (only
 * if it does not exist yet), a per-device directory named after the PCI
 * device, a "run" file and a "threads" u8 file bound to perf_threads.
 * NOTE(review): the return statements taken on each failed check are not
 * present in this listing.
 */
721 static int perf_debugfs_setup(struct perf_ctx
*perf
)
723 struct pci_dev
*pdev
= perf
->ntb
->pdev
;
725 if (!debugfs_initialized())
728 if (!perf_debugfs_dir
) {
729 perf_debugfs_dir
= debugfs_create_dir(KBUILD_MODNAME
, NULL
);
730 if (!perf_debugfs_dir
)
734 perf
->debugfs_node_dir
= debugfs_create_dir(pci_name(pdev
),
736 if (!perf
->debugfs_node_dir
)
/* Owner read/write only; reads report results, writes start a run. */
739 perf
->debugfs_run
= debugfs_create_file("run", S_IRUSR
| S_IWUSR
,
740 perf
->debugfs_node_dir
, perf
,
741 &ntb_perf_debugfs_run
);
742 if (!perf
->debugfs_run
)
745 perf
->debugfs_threads
= debugfs_create_u8("threads", S_IRUSR
| S_IWUSR
,
746 perf
->debugfs_node_dir
,
747 &perf
->perf_threads
);
748 if (!perf
->debugfs_threads
)
/*
 * Probe callback for a newly found NTB device.
 * @client: this NTB client.
 * @ntb:    the NTB device.
 *
 * Visible steps: check the scratchpad count, allocate and initialise the
 * perf_ctx on the device's NUMA node, set up the memory window, register
 * the context callbacks, enable the link and create the debugfs entries.
 * NOTE(review): return statements and error labels are not present in this
 * listing.
 */
754 static int perf_probe(struct ntb_client
*client
, struct ntb_dev
*ntb
)
756 struct pci_dev
*pdev
= ntb
->pdev
;
757 struct perf_ctx
*perf
;
761 if (ntb_spad_count(ntb
) < MAX_SPAD
) {
762 dev_err(&ntb
->dev
, "Not enough scratch pad registers for %s",
767 node
= dev_to_node(&pdev
->dev
);
769 perf
= kzalloc_node(sizeof(*perf
), GFP_KERNEL
, node
);
/* Default to a single worker thread. */
776 perf
->perf_threads
= 1;
777 atomic_set(&perf
->tsync
, 0);
778 mutex_init(&perf
->run_mutex
);
779 spin_lock_init(&perf
->db_lock
);
/* NOTE(review): perf_setup_mw() returns int but the result is discarded here - confirm. */
780 perf_setup_mw(ntb
, perf
);
781 init_waitqueue_head(&perf
->link_wq
);
782 INIT_DELAYED_WORK(&perf
->link_work
, perf_link_work
);
784 rc
= ntb_set_ctx(ntb
, perf
, &perf_ops
);
788 perf
->link_is_up
= false;
789 ntb_link_enable(ntb
, NTB_SPEED_AUTO
, NTB_WIDTH_AUTO
);
792 rc
= perf_debugfs_setup(perf
);
796 perf_clear_thread_status(perf
);
/* NOTE(review): presumably part of the error unwind - label not shown. */
801 cancel_delayed_work_sync(&perf
->link_work
);
/*
 * Remove callback for an NTB device.
 * @client: this NTB client.
 * @ntb:    the NTB device; its ctx is the perf_ctx set in perf_probe().
 *
 * Visible steps: take run_mutex, cancel link_work, disable the link,
 * remove the debugfs tree and release each thread slot's DMA channel.
 * NOTE(review): guards and the final frees are not present in this
 * listing; no mutex_unlock() is visible either.
 */
807 static void perf_remove(struct ntb_client
*client
, struct ntb_dev
*ntb
)
809 struct perf_ctx
*perf
= ntb
->ctx
;
812 dev_dbg(&perf
->ntb
->dev
, "%s called\n", __func__
);
814 mutex_lock(&perf
->run_mutex
);
816 cancel_delayed_work_sync(&perf
->link_work
);
819 ntb_link_disable(ntb
);
/*
 * NOTE(review): this removes the module-wide directory, not only this
 * device's debugfs_node_dir; with more than one device bound, the other
 * devices' entries would disappear as well - confirm.
 */
821 debugfs_remove_recursive(perf_debugfs_dir
);
822 perf_debugfs_dir
= NULL
;
825 for (i
= 0; i
< MAX_THREADS
; i
++) {
826 struct pthr_ctx
*pctx
= &perf
->pthr_ctx
[i
];
829 dma_release_channel(pctx
->dma_chan
);
/*
 * NTB client descriptor registered by module_ntb_client().
 * NOTE(review): only the .remove member is present in this listing.
 */
836 static struct ntb_client perf_client
= {
839 .remove
= perf_remove
,
/* Module entry point: hands perf_client to the module_ntb_client() helper macro. */
842 module_ntb_client(perf_client
);