/*
 * Intel MIC Platform Software Stack (MPSS)
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version 2, as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
#include "../bus/scif_bus.h"
#include "scif_peer_bus.h"
#include "scif_main.h"
#include "scif_nodeqp.h"

/*
 ************************************************************************
 * SCIF node Queue Pair (QP) setup flow:
 *
 * 1) SCIF driver gets probed with a scif_hw_dev via the scif_hw_bus
 * 2) scif_setup_qp(..) allocates the local qp and calls
 *    scif_setup_qp_connect(..) which allocates and maps the local
 *    buffer for the inbound QP
 * 3) The local node updates the device page with the DMA address of the QP
 * 4) A delayed work is scheduled (qp_dwork) which periodically reads if
 *    the peer node has updated its QP DMA address
 * 5) Once a valid non-zero address is found in the QP DMA address field
 *    in the device page, the local node maps the remote node's QP,
 *    updates its outbound QP and sends a SCIF_INIT message to the peer
 * 6) The SCIF_INIT message is received by the peer node QP interrupt bottom
 *    half handler, which calls scif_init(..)
 * 7) scif_init(..) registers a new SCIF peer node by calling
 *    scif_peer_register_device(..) which signifies the addition of a new
 *    SCIF node
 * 8) On the mgmt node, P2P network setup/teardown is initiated if all the
 *    remote nodes are online via scif_p2p_setup(..)
 * 9) For P2P setup, the host maps the remote nodes' aperture and memory
 *    bars and sends a SCIF_NODE_ADD message to both nodes
 * 10) As part of scif_node_add(..), both nodes set up their local inbound
 *    QPs and send a SCIF_NODE_ADD_ACK to the mgmt node
 * 11) As part of scif_node_add_ack(..) the mgmt node forwards the
 *    SCIF_NODE_ADD_ACK to the remote nodes
 * 12) As part of scif_node_add_ack(..) the remote nodes update their
 *    outbound QPs, make sure they can access memory on the remote node
 *    and then add a new SCIF peer node by calling
 *    scif_peer_register_device(..) which signifies the addition of a new
 *    SCIF node
 * 13) The SCIF network is now established across all nodes.
 *
 ************************************************************************
 * SCIF node QP teardown flow (initiated by non-mgmt node):
 *
 * 1) SCIF driver gets a remove callback with a scif_hw_dev via the scif_hw_bus
 * 2) The device page QP DMA address field is updated with 0x0
 * 3) A non-mgmt node now cleans up all local data structures and sends a
 *    SCIF_EXIT message to the peer and waits for a SCIF_EXIT_ACK
 * 4) As part of scif_exit(..) handling, scif_disconnect_node(..) is called
 * 5) scif_disconnect_node(..) sends a SCIF_NODE_REMOVE message to all the
 *    peers and waits for a SCIF_NODE_REMOVE_ACK
 * 6) As part of scif_node_remove(..) a remote node unregisters the peer
 *    node from the SCIF network and sends a SCIF_NODE_REMOVE_ACK
 * 7) When the mgmt node has received all the SCIF_NODE_REMOVE_ACKs
 *    it sends itself a node remove message whose handling cleans up local
 *    data structures and unregisters the peer node from the SCIF network
 * 8) The mgmt node sends a SCIF_EXIT_ACK
 * 9) Upon receipt of the SCIF_EXIT_ACK the node initiating the teardown
 *    completes the SCIF remove routine
 * 10) The SCIF network is now torn down for the node initiating the
 *    teardown sequence
 *
 ************************************************************************
 * SCIF node QP teardown flow (initiated by mgmt node):
 *
 * 1) SCIF driver gets a remove callback with a scif_hw_dev via the scif_hw_bus
 * 2) The device page QP DMA address field is updated with 0x0
 * 3) The mgmt node calls scif_disconnect_node(..)
 * 4) scif_disconnect_node(..) sends a SCIF_NODE_REMOVE message to all the peers
 *    and waits for a SCIF_NODE_REMOVE_ACK
 * 5) As part of scif_node_remove(..) a remote node unregisters the peer
 *    node from the SCIF network and sends a SCIF_NODE_REMOVE_ACK
 * 6) When the mgmt node has received all the SCIF_NODE_REMOVE_ACKs
 *    it unregisters the peer node from the SCIF network
 * 7) The mgmt node sends a SCIF_EXIT message and waits for a SCIF_EXIT_ACK.
 * 8) A non-mgmt node, upon receipt of a SCIF_EXIT message, calls scif_stop(..)
 *    which cleans up local data structures for all SCIF nodes and
 *    then sends a SCIF_EXIT_ACK back to the mgmt node
 * 9) Upon receipt of the SCIF_EXIT_ACK the mgmt node sends itself a node
 *    remove message whose handling cleans up local data structures and
 *    destroys any P2P mappings.
 * 10) The SCIF hardware device for which a remove callback was received is now
 *    disconnected from the SCIF network.
 ************************************************************************
 */
/*
 * Initializes "local" data structures for the QP. Allocates the QP
 * ring buffer (rb) and initializes the "inbound" queue.
 */
int scif_setup_qp_connect(struct scif_qp *qp, dma_addr_t *qp_offset,
                          int local_size, struct scif_dev *scifdev)
{
        void *local_q = NULL;
        int err = 0;
        u32 tmp_rd = 0;

        spin_lock_init(&qp->send_lock);
        spin_lock_init(&qp->recv_lock);

        local_q = kzalloc(local_size, GFP_KERNEL);
        if (!local_q) {
                err = -ENOMEM;
                return err;
        }
        err = scif_map_single(&qp->local_buf, local_q, scifdev, local_size);
        if (err)
                goto kfree;
        /*
         * To setup the inbound_q, the buffer lives locally, the read pointer
         * is remote and the write pointer is local.
         */
        scif_rb_init(&qp->inbound_q,
                     &tmp_rd,
                     &qp->local_write,
                     local_q, get_count_order(local_size));
        /*
         * The read pointer is NULL initially and it is unsafe to use the ring
         * buffer until this changes!
         */
        qp->inbound_q.read_ptr = NULL;
        err = scif_map_single(qp_offset, qp,
                              scifdev, sizeof(struct scif_qp));
        if (err)
                goto unmap;
        qp->local_qp = *qp_offset;
        return err;
unmap:
        scif_unmap_single(qp->local_buf, scifdev, local_size);
        qp->local_buf = 0;
kfree:
        kfree(local_q);
        return err;
}

/* When the other side has already done its allocation, this is called */
int scif_setup_qp_accept(struct scif_qp *qp, dma_addr_t *qp_offset,
                         dma_addr_t phys, int local_size,
                         struct scif_dev *scifdev)
{
        void *local_q;
        void *remote_q;
        struct scif_qp *remote_qp;
        int remote_size;
        int err = 0;

        spin_lock_init(&qp->send_lock);
        spin_lock_init(&qp->recv_lock);
        /* Start by figuring out where we need to point */
        remote_qp = scif_ioremap(phys, sizeof(struct scif_qp), scifdev);
        if (!remote_qp)
                return -EIO;
        qp->remote_qp = remote_qp;
        if (qp->remote_qp->magic != SCIFEP_MAGIC) {
                err = -EIO;
                goto iounmap;
        }
        qp->remote_buf = remote_qp->local_buf;
        remote_size = qp->remote_qp->inbound_q.size;
        remote_q = scif_ioremap(qp->remote_buf, remote_size, scifdev);
        if (!remote_q) {
                err = -EIO;
                goto iounmap;
        }
        qp->remote_qp->local_write = 0;
        /*
         * To setup the outbound_q, the buffer lives in remote memory,
         * the read pointer is local, the write pointer is remote
         */
        scif_rb_init(&qp->outbound_q,
                     &qp->local_read,
                     &qp->remote_qp->local_write,
                     remote_q,
                     get_count_order(remote_size));
        local_q = kzalloc(local_size, GFP_KERNEL);
        if (!local_q) {
                err = -ENOMEM;
                goto iounmap_1;
        }
        err = scif_map_single(&qp->local_buf, local_q, scifdev, local_size);
        if (err)
                goto kfree;
        qp->remote_qp->local_read = 0;
        /*
         * To setup the inbound_q, the buffer lives locally, the read pointer
         * is remote and the write pointer is local
         */
        scif_rb_init(&qp->inbound_q,
                     &qp->remote_qp->local_read,
                     &qp->local_write,
                     local_q, get_count_order(local_size));
        err = scif_map_single(qp_offset, qp, scifdev,
                              sizeof(struct scif_qp));
        if (err)
                goto unmap;
        qp->local_qp = *qp_offset;
        return err;
unmap:
        scif_unmap_single(qp->local_buf, scifdev, local_size);
        qp->local_buf = 0;
kfree:
        kfree(local_q);
iounmap_1:
        scif_iounmap(remote_q, remote_size, scifdev);
        qp->outbound_q.rb_base = NULL;
iounmap:
        scif_iounmap(qp->remote_qp, sizeof(struct scif_qp), scifdev);
        qp->remote_qp = NULL;
        return err;
}

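/*
 * Maps the peer's queue pair once the peer has published its QP DMA address
 * and completes local QP setup: the outbound_q is pointed at the remote ring
 * buffer and the inbound_q is reinitialized with the now-valid remote read
 * pointer.
 */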
int scif_setup_qp_connect_response(struct scif_dev *scifdev,
                                   struct scif_qp *qp, u64 payload)
{
        int err = 0;
        void *r_buf;
        int remote_size;
        phys_addr_t tmp_phys;

        qp->remote_qp = scif_ioremap(payload, sizeof(struct scif_qp), scifdev);

        if (!qp->remote_qp) {
                err = -ENOMEM;
                goto error;
        }

        if (qp->remote_qp->magic != SCIFEP_MAGIC) {
                dev_err(&scifdev->sdev->dev,
                        "SCIFEP_MAGIC mismatch between self %d remote %d\n",
                        scif_dev[scif_info.nodeid].node, scifdev->node);
                err = -ENODEV;
                goto error;
        }

        tmp_phys = qp->remote_qp->local_buf;
        remote_size = qp->remote_qp->inbound_q.size;
        r_buf = scif_ioremap(tmp_phys, remote_size, scifdev);
        if (!r_buf) {
                err = -EIO;
                goto error;
        }

        qp->local_read = 0;
        scif_rb_init(&qp->outbound_q,
                     &qp->local_read,
                     &qp->remote_qp->local_write,
                     r_buf,
                     get_count_order(remote_size));
        /*
         * Resetup the inbound_q now that we know where the
         * inbound_read really is.
         */
        scif_rb_init(&qp->inbound_q,
                     &qp->remote_qp->local_read,
                     &qp->local_write,
                     qp->inbound_q.rb_base,
                     get_count_order(qp->inbound_q.size));
error:
        return err;
}

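/* Ring the peer's doorbell, using the P2P interrupt path for P2P devices */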
static __always_inline void
scif_send_msg_intr(struct scif_dev *scifdev)
{
        struct scif_hw_dev *sdev = scifdev->sdev;

        if (scifdev_is_p2p(scifdev))
                sdev->hw_ops->send_p2p_intr(sdev, scifdev->rdb, &scifdev->mmio);
        else
                sdev->hw_ops->send_intr(sdev, scifdev->rdb);
}

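/*
 * Called once the peer's QP DMA address is available in order to complete
 * QP setup and notify the peer via a SCIF_INIT message.
 */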
int scif_qp_response(phys_addr_t phys, struct scif_dev *scifdev)
{
        int err = 0;
        struct scifmsg msg;

        err = scif_setup_qp_connect_response(scifdev, scifdev->qpairs, phys);
        if (!err) {
                /*
                 * Now that everything is setup and mapped, we're ready
                 * to tell the peer about our queue's location
                 */
                msg.uop = SCIF_INIT;
                msg.dst.node = scifdev->node;
                err = scif_nodeqp_send(scifdev, &msg);
        }
        return err;
}

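/* Inform the peer that this node is exiting and wait for a SCIF_EXIT_ACK */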
void scif_send_exit(struct scif_dev *scifdev)
{
        struct scifmsg msg;
        int ret;

        scifdev->exit = OP_IN_PROGRESS;
        msg.uop = SCIF_EXIT;
        msg.src.node = scif_info.nodeid;
        msg.dst.node = scifdev->node;
        ret = scif_nodeqp_send(scifdev, &msg);
        if (ret)
                goto done;
        /* Wait for a SCIF_EXIT_ACK message */
        wait_event_timeout(scif_info.exitwq, scifdev->exit == OP_COMPLETED,
                           SCIF_NODE_ALIVE_TIMEOUT);
done:
        scifdev->exit = OP_IDLE;
}

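/* Allocate the local QP and set up its inbound queue */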
int scif_setup_qp(struct scif_dev *scifdev)
{
        int err = 0;
        int local_size;
        struct scif_qp *qp;

        local_size = SCIF_NODE_QP_SIZE;

        qp = kzalloc(sizeof(*qp), GFP_KERNEL);
        if (!qp) {
                err = -ENOMEM;
                return err;
        }
        qp->magic = SCIFEP_MAGIC;
        scifdev->qpairs = qp;
        err = scif_setup_qp_connect(qp, &scifdev->qp_dma_addr,
                                    local_size, scifdev);
        if (err)
                goto free_qp;
        /*
         * We're as setup as we can be. The inbound_q is setup, w/o a usable
         * outbound q. When we get a message, the read_ptr will be updated,
         * and we will pull the message.
         */
        return err;
free_qp:
        kfree(scifdev->qpairs);
        scifdev->qpairs = NULL;
        return err;
}

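/* Free a scatterlist allocated by scif_p2p_setsg(..) */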
static void scif_p2p_freesg(struct scatterlist *sg)
{
        kfree(sg);
}

static struct scatterlist *
scif_p2p_setsg(phys_addr_t pa, int page_size, int page_cnt)
{
        struct scatterlist *sg;
        struct page *page;
        int i;

        sg = kcalloc(page_cnt, sizeof(struct scatterlist), GFP_KERNEL);
        if (!sg)
                return NULL;
        sg_init_table(sg, page_cnt);
        for (i = 0; i < page_cnt; i++) {
                page = pfn_to_page(pa >> PAGE_SHIFT);
                sg_set_page(&sg[i], page, page_size, 0);
                pa += page_size;
        }
        return sg;
}

/* Init p2p mappings required to access peerdev from scifdev */
static struct scif_p2p_info *
scif_init_p2p_info(struct scif_dev *scifdev, struct scif_dev *peerdev)
{
        struct scif_p2p_info *p2p;
        int num_mmio_pages, num_aper_pages, sg_page_shift, err, num_aper_chunks;
        struct scif_hw_dev *psdev = peerdev->sdev;
        struct scif_hw_dev *sdev = scifdev->sdev;

        num_mmio_pages = psdev->mmio->len >> PAGE_SHIFT;
        num_aper_pages = psdev->aper->len >> PAGE_SHIFT;

        p2p = kzalloc(sizeof(*p2p), GFP_KERNEL);
        if (!p2p)
                return NULL;
        p2p->ppi_sg[SCIF_PPI_MMIO] = scif_p2p_setsg(psdev->mmio->pa,
                                                    PAGE_SIZE, num_mmio_pages);
        if (!p2p->ppi_sg[SCIF_PPI_MMIO])
                goto free_p2p;
        p2p->sg_nentries[SCIF_PPI_MMIO] = num_mmio_pages;
        sg_page_shift = get_order(min(psdev->aper->len, (u64)(1 << 30)));
        num_aper_chunks = num_aper_pages >> (sg_page_shift - PAGE_SHIFT);
        p2p->ppi_sg[SCIF_PPI_APER] = scif_p2p_setsg(psdev->aper->pa,
                                                    1 << sg_page_shift,
                                                    num_aper_chunks);
        if (!p2p->ppi_sg[SCIF_PPI_APER])
                goto scif_p2p_free;
        p2p->sg_nentries[SCIF_PPI_APER] = num_aper_chunks;
        err = dma_map_sg(&sdev->dev, p2p->ppi_sg[SCIF_PPI_MMIO],
                         num_mmio_pages, PCI_DMA_BIDIRECTIONAL);
        if (err != num_mmio_pages)
                goto scif_p2p_free;
        err = dma_map_sg(&sdev->dev, p2p->ppi_sg[SCIF_PPI_APER],
                         num_aper_chunks, PCI_DMA_BIDIRECTIONAL);
        if (err != num_aper_chunks)
                goto dma_unmap;
        p2p->ppi_da[SCIF_PPI_MMIO] = sg_dma_address(p2p->ppi_sg[SCIF_PPI_MMIO]);
        p2p->ppi_da[SCIF_PPI_APER] = sg_dma_address(p2p->ppi_sg[SCIF_PPI_APER]);
        p2p->ppi_len[SCIF_PPI_MMIO] = num_mmio_pages;
        p2p->ppi_len[SCIF_PPI_APER] = num_aper_pages;
        p2p->ppi_peer_id = peerdev->node;
        return p2p;
dma_unmap:
        dma_unmap_sg(&sdev->dev, p2p->ppi_sg[SCIF_PPI_MMIO],
                     p2p->sg_nentries[SCIF_PPI_MMIO], DMA_BIDIRECTIONAL);
scif_p2p_free:
        scif_p2p_freesg(p2p->ppi_sg[SCIF_PPI_MMIO]);
        scif_p2p_freesg(p2p->ppi_sg[SCIF_PPI_APER]);
free_p2p:
        kfree(p2p);
        return NULL;
}

/**
 * scif_node_connect: Respond to SCIF_NODE_CONNECT interrupt message
 * @dst: Destination node
 *
 * Connect the src and dst node by setting up the p2p connection
 * between them. Management node here acts like a proxy.
 */
static void scif_node_connect(struct scif_dev *scifdev, int dst)
{
        struct scif_dev *dev_j = scifdev;
        struct scif_dev *dev_i = NULL;
        struct scif_p2p_info *p2p_ij = NULL;    /* bus addr for j from i */
        struct scif_p2p_info *p2p_ji = NULL;    /* bus addr for i from j */
        struct scif_p2p_info *p2p;
        struct list_head *pos, *tmp;
        struct scifmsg msg;
        int err;
        u64 tmppayload;

        if (dst < 1 || dst > scif_info.maxid)
                return;

        dev_i = &scif_dev[dst];

        if (!_scifdev_alive(dev_i))
                return;
        /*
         * If the p2p connection is already setup or in the process of setting
         * up then just ignore this request. The requested node will get
         * informed by SCIF_NODE_ADD_ACK or SCIF_NODE_ADD_NACK
         */
        if (!list_empty(&dev_i->p2p)) {
                list_for_each_safe(pos, tmp, &dev_i->p2p) {
                        p2p = list_entry(pos, struct scif_p2p_info, ppi_list);
                        if (p2p->ppi_peer_id == dev_j->node)
                                return;
                }
        }
        p2p_ij = scif_init_p2p_info(dev_i, dev_j);
        if (!p2p_ij)
                return;
        p2p_ji = scif_init_p2p_info(dev_j, dev_i);
        if (!p2p_ji)
                return;
        list_add_tail(&p2p_ij->ppi_list, &dev_i->p2p);
        list_add_tail(&p2p_ji->ppi_list, &dev_j->p2p);

        /*
         * Send a SCIF_NODE_ADD to dev_i, pass it its bus address
         * as seen from dev_j
         */
        msg.uop = SCIF_NODE_ADD;
        msg.src.node = dev_j->node;
        msg.dst.node = dev_i->node;

        msg.payload[0] = p2p_ji->ppi_da[SCIF_PPI_APER];
        msg.payload[1] = p2p_ij->ppi_da[SCIF_PPI_MMIO];
        msg.payload[2] = p2p_ij->ppi_da[SCIF_PPI_APER];
        msg.payload[3] = p2p_ij->ppi_len[SCIF_PPI_APER] << PAGE_SHIFT;

        err = scif_nodeqp_send(dev_i, &msg);
        if (err) {
                dev_err(&scifdev->sdev->dev,
                        "%s %d error %d\n", __func__, __LINE__, err);
                return;
        }

        /* Same as above but to dev_j */
        msg.uop = SCIF_NODE_ADD;
        msg.src.node = dev_i->node;
        msg.dst.node = dev_j->node;

        tmppayload = msg.payload[0];
        msg.payload[0] = msg.payload[2];
        msg.payload[2] = tmppayload;
        msg.payload[1] = p2p_ji->ppi_da[SCIF_PPI_MMIO];
        msg.payload[3] = p2p_ji->ppi_len[SCIF_PPI_APER] << PAGE_SHIFT;

        scif_nodeqp_send(dev_j, &msg);
}

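/*
 * Initiate P2P setup between every pair of online nodes. Returns early
 * unless P2P is enabled and all nodes are alive; called on the mgmt node
 * under scif_info.conflock.
 */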
static void scif_p2p_setup(void)
{
        int i, j;

        if (!scif_info.p2p_enable)
                return;

        for (i = 1; i <= scif_info.maxid; i++)
                if (!_scifdev_alive(&scif_dev[i]))
                        return;

        for (i = 1; i <= scif_info.maxid; i++) {
                for (j = 1; j <= scif_info.maxid; j++) {
                        struct scif_dev *scifdev = &scif_dev[i];

                        if (i == j)
                                continue;
                        scif_node_connect(scifdev, j);
                }
        }
}

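/*
 * Work handler which registers the peer device after a SCIF_INIT message
 * is received; on the mgmt node it also kicks off P2P network setup.
 */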
void scif_qp_response_ack(struct work_struct *work)
{
        struct scif_dev *scifdev = container_of(work, struct scif_dev,
                                                init_msg_work);
        struct scif_peer_dev *spdev;

        /* Drop the INIT message if it has already been received */
        if (_scifdev_alive(scifdev))
                return;

        spdev = scif_peer_register_device(scifdev);
        if (IS_ERR(spdev))
                return;

        if (scif_is_mgmt_node()) {
                mutex_lock(&scif_info.conflock);
                scif_p2p_setup();
                mutex_unlock(&scif_info.conflock);
        }
}

static char *message_types[] = {"BAD",
                                "SCIF_INIT",
                                "SCIF_EXIT",
                                "SCIF_EXIT_ACK",
                                "SCIF_NODE_ADD",
                                "SCIF_NODE_ADD_ACK",
                                "SCIF_NODE_ADD_NACK",
                                "SCIF_NODE_REMOVE",
                                "SCIF_NODE_REMOVE_ACK",
                                "SCIF_CNCT_REQ",
                                "SCIF_CNCT_GNT",
                                "SCIF_CNCT_GNTACK",
                                "SCIF_CNCT_GNTNACK",
                                "SCIF_CNCT_REJ",
                                "SCIF_DISCNCT",
                                "SCIF_DISCNT_ACK",
                                "SCIF_CLIENT_SENT",
                                "SCIF_CLIENT_RCVD",
                                "SCIF_GET_NODE_INFO"};

static void
scif_display_message(struct scif_dev *scifdev, struct scifmsg *msg,
                     const char *label)
{
        if (!scif_info.en_msg_log)
                return;
        if (msg->uop > SCIF_MAX_MSG) {
                dev_err(&scifdev->sdev->dev,
                        "%s: unknown msg type %d\n", label, msg->uop);
                return;
        }
        dev_info(&scifdev->sdev->dev,
                 "%s: msg type %s, src %d:%d, dest %d:%d payload 0x%llx:0x%llx:0x%llx:0x%llx\n",
                 label, message_types[msg->uop], msg->src.node, msg->src.port,
                 msg->dst.node, msg->dst.port, msg->payload[0], msg->payload[1],
                 msg->payload[2], msg->payload[3]);
}

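/*
 * Writes a message into the outbound ring buffer under the send lock and
 * raises an interrupt on the peer (or queues loopback work for self).
 * Retries for up to SCIF_NODEQP_SEND_TO_MSEC before giving up.
 */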
int _scif_nodeqp_send(struct scif_dev *scifdev, struct scifmsg *msg)
{
        struct scif_qp *qp = scifdev->qpairs;
        int err = -ENOMEM, loop_cnt = 0;

        scif_display_message(scifdev, msg, "Sent");
        if (!qp) {
                err = -EINVAL;
                goto error;
        }
        spin_lock(&qp->send_lock);

        while ((err = scif_rb_write(&qp->outbound_q,
                                    msg, sizeof(struct scifmsg)))) {
                mdelay(1);
#define SCIF_NODEQP_SEND_TO_MSEC (3 * 1000)
                if (loop_cnt++ > (SCIF_NODEQP_SEND_TO_MSEC)) {
                        err = -ENODEV;
                        break;
                }
        }
        if (!err)
                scif_rb_commit(&qp->outbound_q);
        spin_unlock(&qp->send_lock);
        if (!err) {
                if (scifdev_self(scifdev))
                        /*
                         * For loopback we need to emulate an interrupt by
                         * queuing work for the queue handling real node
                         * Qp messages.
                         */
                        queue_work(scifdev->intr_wq, &scifdev->intr_bh);
                else
                        scif_send_msg_intr(scifdev);
        }
error:
        if (err)
                dev_dbg(&scifdev->sdev->dev,
                        "%s %d error %d uop %d\n",
                        __func__, __LINE__, err, msg->uop);
        return err;
}

/**
 * scif_nodeqp_send - Send a message on the node queue pair
 * @scifdev: Scif Device.
 * @msg: The message to be sent.
 */
int scif_nodeqp_send(struct scif_dev *scifdev, struct scifmsg *msg)
{
        int err;
        struct device *spdev = NULL;

        if (msg->uop > SCIF_EXIT_ACK) {
                /* Don't send messages once the exit flow has begun */
                if (OP_IDLE != scifdev->exit)
                        return -ENODEV;
                spdev = scif_get_peer_dev(scifdev);
                if (IS_ERR(spdev)) {
                        err = PTR_ERR(spdev);
                        return err;
                }
        }
        err = _scif_nodeqp_send(scifdev, msg);
        if (msg->uop > SCIF_EXIT_ACK)
                scif_put_peer_dev(spdev);
        return err;
}

/*
 * Work queue handler for servicing miscellaneous SCIF tasks:
 * 1) Cleanup of zombie endpoints.
 */
void scif_misc_handler(struct work_struct *work)
{
        scif_cleanup_zombie_epd();
}

/**
 * scif_init() - Respond to SCIF_INIT interrupt message
 * @scifdev:    Remote SCIF device node
 * @msg:        Interrupt message
 */
static __always_inline void
scif_init(struct scif_dev *scifdev, struct scifmsg *msg)
{
        /*
         * Allow the thread waiting for device page updates for the peer QP DMA
         * address to complete initializing the inbound_q.
         */
        flush_delayed_work(&scifdev->qp_dwork);
        /*
         * Delegate the peer device registration to a workqueue, otherwise if
         * SCIF client probe (called during peer device registration) calls
         * scif_connect(..), it will block the message processing thread causing
         * a deadlock.
         */
        schedule_work(&scifdev->init_msg_work);
}

/**
 * scif_exit() - Respond to SCIF_EXIT interrupt message
 * @scifdev:    Remote SCIF device node
 * @msg:        Interrupt message
 *
 * This function stops the SCIF interface for the node which sent
 * the SCIF_EXIT message and starts waiting for that node to
 * resetup the queue pair again.
 */
static __always_inline void
scif_exit(struct scif_dev *scifdev, struct scifmsg *unused)
{
        scifdev->exit_ack_pending = true;
        if (scif_is_mgmt_node())
                scif_disconnect_node(scifdev->node, false);
        else
                scif_stop(scifdev);
        schedule_delayed_work(&scifdev->qp_dwork,
                              msecs_to_jiffies(1000));
}

/**
 * scif_exit_ack() - Respond to SCIF_EXIT_ACK interrupt message
 * @scifdev:    Remote SCIF device node
 * @msg:        Interrupt message
 */
static __always_inline void
scif_exit_ack(struct scif_dev *scifdev, struct scifmsg *unused)
{
        scifdev->exit = OP_COMPLETED;
        wake_up(&scif_info.exitwq);
}

/**
 * scif_node_add() - Respond to SCIF_NODE_ADD interrupt message
 * @scifdev:    Remote SCIF device node
 * @msg:        Interrupt message
 *
 * When the mgmt node driver has finished initializing a MIC node queue pair it
 * marks the node as online. It then looks for all currently online MIC cards
 * and sends a SCIF_NODE_ADD message to identify the ID of the new card for
 * peer to peer initialization.
 *
 * The local node allocates its incoming queue and sends its address in the
 * SCIF_NODE_ADD_ACK message back to the mgmt node, the mgmt node "reflects"
 * this message to the new node.
 */
static __always_inline void
scif_node_add(struct scif_dev *scifdev, struct scifmsg *msg)
{
        struct scif_dev *newdev;
        dma_addr_t qp_offset;
        int qp_connect;
        struct scif_hw_dev *sdev;

        dev_dbg(&scifdev->sdev->dev,
                "Scifdev %d:%d received NODE_ADD msg for node %d\n",
                scifdev->node, msg->dst.node, msg->src.node);
        dev_dbg(&scifdev->sdev->dev,
                "Remote address for this node's aperture %llx\n",
                msg->payload[0]);
        newdev = &scif_dev[msg->src.node];
        newdev->node = msg->src.node;
        newdev->sdev = scif_dev[SCIF_MGMT_NODE].sdev;
        sdev = newdev->sdev;

        if (scif_setup_intr_wq(newdev)) {
                dev_err(&scifdev->sdev->dev,
                        "failed to setup interrupts for %d\n", msg->src.node);
                goto interrupt_setup_error;
        }
        newdev->mmio.va = ioremap_nocache(msg->payload[1], sdev->mmio->len);
        if (!newdev->mmio.va) {
                dev_err(&scifdev->sdev->dev,
                        "failed to map mmio for %d\n", msg->src.node);
                goto mmio_map_error;
        }
        newdev->qpairs = kzalloc(sizeof(*newdev->qpairs), GFP_KERNEL);
        if (!newdev->qpairs)
                goto qp_alloc_error;
        /*
         * Set the base address of the remote node's memory since it gets
         * added to qp_offset
         */
        newdev->base_addr = msg->payload[0];

        qp_connect = scif_setup_qp_connect(newdev->qpairs, &qp_offset,
                                           SCIF_NODE_QP_SIZE, newdev);
        if (qp_connect) {
                dev_err(&scifdev->sdev->dev,
                        "failed to setup qp_connect %d\n", qp_connect);
                goto qp_connect_error;
        }

        newdev->db = sdev->hw_ops->next_db(sdev);
        newdev->cookie = sdev->hw_ops->request_irq(sdev, scif_intr_handler,
                                                   "SCIF_INTR", newdev,
                                                   newdev->db);
        if (IS_ERR(newdev->cookie))
                goto qp_connect_error;
        newdev->qpairs->magic = SCIFEP_MAGIC;
        newdev->qpairs->qp_state = SCIF_QP_OFFLINE;

        msg->uop = SCIF_NODE_ADD_ACK;
        msg->dst.node = msg->src.node;
        msg->src.node = scif_info.nodeid;
        msg->payload[0] = qp_offset;
        msg->payload[2] = newdev->db;
        scif_nodeqp_send(&scif_dev[SCIF_MGMT_NODE], msg);
        return;
qp_connect_error:
        kfree(newdev->qpairs);
        newdev->qpairs = NULL;
qp_alloc_error:
        iounmap(newdev->mmio.va);
        newdev->mmio.va = NULL;
mmio_map_error:
interrupt_setup_error:
        dev_err(&scifdev->sdev->dev,
                "node add failed for node %d\n", msg->src.node);
        msg->uop = SCIF_NODE_ADD_NACK;
        msg->dst.node = msg->src.node;
        msg->src.node = scif_info.nodeid;
        scif_nodeqp_send(&scif_dev[SCIF_MGMT_NODE], msg);
}

void scif_poll_qp_state(struct work_struct *work)
{
#define SCIF_NODE_QP_RETRY 100
#define SCIF_NODE_QP_TIMEOUT 100
        struct scif_dev *peerdev = container_of(work, struct scif_dev,
                                                p2p_dwork.work);
        struct scif_qp *qp = &peerdev->qpairs[0];

        if (qp->qp_state != SCIF_QP_ONLINE ||
            qp->remote_qp->qp_state != SCIF_QP_ONLINE) {
                if (peerdev->p2p_retry++ == SCIF_NODE_QP_RETRY) {
                        dev_err(&peerdev->sdev->dev,
                                "Warning: QP check timeout with state %d\n",
                                qp->qp_state);
                        goto timeout;
                }
                schedule_delayed_work(&peerdev->p2p_dwork,
                                      msecs_to_jiffies(SCIF_NODE_QP_TIMEOUT));
                return;
        }
        scif_peer_register_device(peerdev);
        return;
timeout:
        dev_err(&peerdev->sdev->dev,
                "%s %d remote node %d offline, state = 0x%x\n",
                __func__, __LINE__, peerdev->node, qp->qp_state);
        qp->remote_qp->qp_state = SCIF_QP_OFFLINE;
        scif_cleanup_scifdev(peerdev);
}

/**
 * scif_node_add_ack() - Respond to SCIF_NODE_ADD_ACK interrupt message
 * @scifdev:    Remote SCIF device node
 * @msg:        Interrupt message
 *
 * After a MIC node receives the SCIF_NODE_ADD_ACK message it sends this
 * message to the mgmt node to confirm the sequence is finished.
 */
static __always_inline void
scif_node_add_ack(struct scif_dev *scifdev, struct scifmsg *msg)
{
        struct scif_dev *peerdev;
        struct scif_qp *qp;
        struct scif_dev *dst_dev = &scif_dev[msg->dst.node];

        dev_dbg(&scifdev->sdev->dev,
                "Scifdev %d received SCIF_NODE_ADD_ACK msg src %d dst %d\n",
                scifdev->node, msg->src.node, msg->dst.node);
        dev_dbg(&scifdev->sdev->dev,
                "payload %llx %llx %llx %llx\n", msg->payload[0],
                msg->payload[1], msg->payload[2], msg->payload[3]);
        if (scif_is_mgmt_node()) {
                /*
                 * the lock serializes with scif_qp_response_ack. The mgmt node
                 * is forwarding the NODE_ADD_ACK message from src to dst. We
                 * need to make sure that the dst has already received a
                 * NODE_ADD for src and setup its end of the qp to dst
                 */
                mutex_lock(&scif_info.conflock);
                msg->payload[1] = scif_info.maxid;
                scif_nodeqp_send(dst_dev, msg);
                mutex_unlock(&scif_info.conflock);
                return;
        }
        peerdev = &scif_dev[msg->src.node];
        peerdev->sdev = scif_dev[SCIF_MGMT_NODE].sdev;
        peerdev->node = msg->src.node;

        qp = &peerdev->qpairs[0];

        if ((scif_setup_qp_connect_response(peerdev, &peerdev->qpairs[0],
                                            msg->payload[0])))
                goto local_error;
        peerdev->rdb = msg->payload[2];
        qp->remote_qp->qp_state = SCIF_QP_ONLINE;
        schedule_delayed_work(&peerdev->p2p_dwork, 0);
        return;
local_error:
        scif_cleanup_scifdev(peerdev);
}

/*
 * scif_node_add_nack: Respond to SCIF_NODE_ADD_NACK interrupt message
 * @msg: Interrupt message
 *
 * SCIF_NODE_ADD failed, so inform the waiting wq.
 */
static __always_inline void
scif_node_add_nack(struct scif_dev *scifdev, struct scifmsg *msg)
{
        if (scif_is_mgmt_node()) {
                struct scif_dev *dst_dev = &scif_dev[msg->dst.node];

                dev_dbg(&scifdev->sdev->dev,
                        "SCIF_NODE_ADD_NACK received from %d\n", scifdev->node);
                scif_nodeqp_send(dst_dev, msg);
        }
}

/*
 * scif_node_remove: Handle SCIF_NODE_REMOVE message
 * @msg: Interrupt message
 *
 * Handle node removal.
 */
static __always_inline void
scif_node_remove(struct scif_dev *scifdev, struct scifmsg *msg)
{
        int node = msg->payload[0];
        struct scif_dev *scdev = &scif_dev[node];

        scdev->node_remove_ack_pending = true;
        scif_handle_remove_node(node);
}

/*
 * scif_node_remove_ack: Handle SCIF_NODE_REMOVE_ACK message
 * @msg: Interrupt message
 *
 * The peer has acked a SCIF_NODE_REMOVE message.
 */
static __always_inline void
scif_node_remove_ack(struct scif_dev *scifdev, struct scifmsg *msg)
{
        struct scif_dev *sdev = &scif_dev[msg->payload[0]];

        atomic_inc(&sdev->disconn_rescnt);
        wake_up(&sdev->disconn_wq);
}

/*
 * scif_get_node_info: Respond to SCIF_GET_NODE_INFO interrupt message
 * @msg: Interrupt message
 *
 * Retrieve node info i.e. maxid and total from the mgmt node.
 */
static __always_inline void
scif_get_node_info_resp(struct scif_dev *scifdev, struct scifmsg *msg)
{
        if (scif_is_mgmt_node()) {
                swap(msg->dst.node, msg->src.node);
                mutex_lock(&scif_info.conflock);
                msg->payload[1] = scif_info.maxid;
                msg->payload[2] = scif_info.total;
                mutex_unlock(&scif_info.conflock);
                scif_nodeqp_send(scifdev, msg);
        } else {
                struct completion *node_info =
                        (struct completion *)msg->payload[3];

                mutex_lock(&scif_info.conflock);
                scif_info.maxid = msg->payload[1];
                scif_info.total = msg->payload[2];
                complete_all(node_info);
                mutex_unlock(&scif_info.conflock);
        }
}

static void
scif_msg_unknown(struct scif_dev *scifdev, struct scifmsg *msg)
{
        /* Bogus Node Qp Message? */
        dev_err(&scifdev->sdev->dev,
                "Unknown message 0x%x scifdev->node 0x%x\n",
                msg->uop, scifdev->node);
}

static void (*scif_intr_func[SCIF_MAX_MSG + 1])
            (struct scif_dev *, struct scifmsg *msg) = {
        scif_msg_unknown,       /* Error */
        scif_init,              /* SCIF_INIT */
        scif_exit,              /* SCIF_EXIT */
        scif_exit_ack,          /* SCIF_EXIT_ACK */
        scif_node_add,          /* SCIF_NODE_ADD */
        scif_node_add_ack,      /* SCIF_NODE_ADD_ACK */
        scif_node_add_nack,     /* SCIF_NODE_ADD_NACK */
        scif_node_remove,       /* SCIF_NODE_REMOVE */
        scif_node_remove_ack,   /* SCIF_NODE_REMOVE_ACK */
        scif_cnctreq,           /* SCIF_CNCT_REQ */
        scif_cnctgnt,           /* SCIF_CNCT_GNT */
        scif_cnctgnt_ack,       /* SCIF_CNCT_GNTACK */
        scif_cnctgnt_nack,      /* SCIF_CNCT_GNTNACK */
        scif_cnctrej,           /* SCIF_CNCT_REJ */
        scif_discnct,           /* SCIF_DISCNCT */
        scif_discnt_ack,        /* SCIF_DISCNT_ACK */
        scif_clientsend,        /* SCIF_CLIENT_SENT */
        scif_clientrcvd,        /* SCIF_CLIENT_RCVD */
        scif_get_node_info_resp,/* SCIF_GET_NODE_INFO */
};

/**
 * scif_nodeqp_msg_handler() - Common handler for node messages
 * @scifdev: Remote device to respond to
 * @qp: Remote memory pointer
 * @msg: The message to be handled.
 *
 * This routine calls the appropriate routine to handle a Node Qp
 * message receipt.
 */
static int scif_max_msg_id = SCIF_MAX_MSG;

static void
scif_nodeqp_msg_handler(struct scif_dev *scifdev,
                        struct scif_qp *qp, struct scifmsg *msg)
{
        scif_display_message(scifdev, msg, "Rcvd");

        if (msg->uop > (u32)scif_max_msg_id) {
                /* Bogus Node Qp Message? */
                dev_err(&scifdev->sdev->dev,
                        "Unknown message 0x%x scifdev->node 0x%x\n",
                        msg->uop, scifdev->node);
                return;
        }

        scif_intr_func[msg->uop](scifdev, msg);
}

/**
 * scif_nodeqp_intrhandler() - Interrupt handler for node messages
 * @scifdev:    Remote device to respond to
 * @qp:         Remote memory pointer
 *
 * This routine is triggered by the interrupt mechanism. It reads
 * messages from the node queue RB and calls the Node QP Message handling
 * routine.
 */
void scif_nodeqp_intrhandler(struct scif_dev *scifdev, struct scif_qp *qp)
{
        struct scifmsg msg;
        int read_size;

        do {
                read_size = scif_rb_get_next(&qp->inbound_q, &msg, sizeof(msg));
                if (!read_size)
                        break;
                scif_nodeqp_msg_handler(scifdev, qp, &msg);
                /*
                 * The node queue pair is unmapped so skip the read pointer
                 * update after receipt of a SCIF_EXIT_ACK
                 */
                if (SCIF_EXIT_ACK == msg.uop)
                        break;
                scif_rb_update_read_ptr(&qp->inbound_q);
        } while (read_size == sizeof(msg));
}

/**
 * scif_loopb_wq_handler - Loopback Workqueue Handler.
 * @work: loop back work
 *
 * This work queue routine is invoked by the loopback work queue handler.
 * It grabs the recv lock, dequeues any available messages from the head
 * of the loopback message list, calls the node QP message handler,
 * waits for it to return, then frees up this message and dequeues more
 * elements of the list if available.
 */
static void scif_loopb_wq_handler(struct work_struct *unused)
{
        struct scif_dev *scifdev = scif_info.loopb_dev;
        struct scif_qp *qp = scifdev->qpairs;
        struct scif_loopb_msg *msg;

        do {
                msg = NULL;
                spin_lock(&qp->recv_lock);
                if (!list_empty(&scif_info.loopb_recv_q)) {
                        msg = list_first_entry(&scif_info.loopb_recv_q,
                                               struct scif_loopb_msg,
                                               list);
                        list_del(&msg->list);
                }
                spin_unlock(&qp->recv_lock);

                if (msg) {
                        scif_nodeqp_msg_handler(scifdev, qp, &msg->msg);
                        kfree(msg);
                }
        } while (msg);
}

/**
 * scif_loopb_msg_handler() - Workqueue handler for loopback messages.
 * @scifdev: SCIF device
 * @qp: Queue pair.
 *
 * This work queue routine is triggered when a loopback message is received.
 *
 * We need special handling for receiving Node Qp messages on a loopback SCIF
 * device via two workqueues for receiving messages.
 *
 * The reason we need the extra workqueue which is not required with *normal*
 * non-loopback SCIF devices is the potential classic deadlock described below:
 *
 * Thread A tries to send a message on a loopback SCIF device and blocks since
 * there is no space in the RB while it has the send_lock held or another
 * lock called lock X for example.
 *
 * Thread B: The Loopback Node QP message receive workqueue receives the message
 * and tries to send a message (e.g. an ACK) to the loopback SCIF device. It
 * tries to grab the send lock again or lock X and deadlocks with Thread A. The
 * RB cannot be drained any further due to this classic deadlock.
 *
 * In order to avoid deadlocks as mentioned above we have an extra level of
 * indirection achieved by having two workqueues.
 * 1) The first workqueue whose handler is scif_loopb_msg_handler reads
 * messages from the Node QP RB, adds them to a list and queues work for the
 * second workqueue.
 *
 * 2) The second workqueue whose handler is scif_loopb_wq_handler dequeues
 * messages from the list, handles them, frees up the memory and dequeues
 * more elements from the list if possible.
 */
int
scif_loopb_msg_handler(struct scif_dev *scifdev, struct scif_qp *qp)
{
        int read_size;
        struct scif_loopb_msg *msg;

        do {
                msg = kmalloc(sizeof(*msg), GFP_KERNEL);
                if (!msg)
                        return -ENOMEM;
                read_size = scif_rb_get_next(&qp->inbound_q, &msg->msg,
                                             sizeof(struct scifmsg));
                if (read_size != sizeof(struct scifmsg)) {
                        kfree(msg);
                        scif_rb_update_read_ptr(&qp->inbound_q);
                        break;
                }
                spin_lock(&qp->recv_lock);
                list_add_tail(&msg->list, &scif_info.loopb_recv_q);
                spin_unlock(&qp->recv_lock);
                queue_work(scif_info.loopb_wq, &scif_info.loopb_work);
                scif_rb_update_read_ptr(&qp->inbound_q);
        } while (read_size == sizeof(struct scifmsg));
        return read_size;
}

/**
 * scif_setup_loopback_qp - One time setup work for Loopback Node Qp.
 * @scifdev: SCIF device
 *
 * Sets up the required loopback workqueues, queue pairs and ring buffers
 */
int scif_setup_loopback_qp(struct scif_dev *scifdev)
{
        int err = 0;
        void *local_q;
        struct scif_qp *qp;
        struct scif_peer_dev *spdev;

        err = scif_setup_intr_wq(scifdev);
        if (err)
                goto exit;
        INIT_LIST_HEAD(&scif_info.loopb_recv_q);
        snprintf(scif_info.loopb_wqname, sizeof(scif_info.loopb_wqname),
                 "SCIF LOOPB %d", scifdev->node);
        scif_info.loopb_wq =
                alloc_ordered_workqueue(scif_info.loopb_wqname, 0);
        if (!scif_info.loopb_wq) {
                err = -ENOMEM;
                goto destroy_intr;
        }
        INIT_WORK(&scif_info.loopb_work, scif_loopb_wq_handler);
        /* Allocate Self Qpair */
        scifdev->qpairs = kzalloc(sizeof(*scifdev->qpairs), GFP_KERNEL);
        if (!scifdev->qpairs) {
                err = -ENOMEM;
                goto destroy_loopb_wq;
        }

        qp = scifdev->qpairs;
        qp->magic = SCIFEP_MAGIC;
        spin_lock_init(&qp->send_lock);
        spin_lock_init(&qp->recv_lock);

        local_q = kzalloc(SCIF_NODE_QP_SIZE, GFP_KERNEL);
        if (!local_q) {
                err = -ENOMEM;
                goto free_qpairs;
        }
        /*
         * For loopback the inbound_q and outbound_q are essentially the same
         * since the Node sends a message on the loopback interface to the
         * outbound_q which is then received on the inbound_q.
         */
        scif_rb_init(&qp->outbound_q,
                     &qp->local_read,
                     &qp->local_write,
                     local_q, get_count_order(SCIF_NODE_QP_SIZE));

        scif_rb_init(&qp->inbound_q,
                     &qp->local_read,
                     &qp->local_write,
                     local_q, get_count_order(SCIF_NODE_QP_SIZE));
        scif_info.nodeid = scifdev->node;
        spdev = scif_peer_register_device(scifdev);
        if (IS_ERR(spdev)) {
                err = PTR_ERR(spdev);
                goto free_local_q;
        }
        scif_info.loopb_dev = scifdev;
        return err;
free_local_q:
        kfree(local_q);
free_qpairs:
        kfree(scifdev->qpairs);
destroy_loopb_wq:
        destroy_workqueue(scif_info.loopb_wq);
destroy_intr:
        scif_destroy_intr_wq(scifdev);
exit:
        return err;
}

/**
 * scif_destroy_loopback_qp - One time uninit work for Loopback Node Qp
 * @scifdev: SCIF device
 *
 * Destroys the workqueues and frees up the Ring Buffer and Queue Pair memory.
 */
int scif_destroy_loopback_qp(struct scif_dev *scifdev)
{
        struct scif_peer_dev *spdev;

        rcu_read_lock();
        spdev = rcu_dereference(scifdev->spdev);
        rcu_read_unlock();
        if (spdev)
                scif_peer_unregister_device(spdev);
        destroy_workqueue(scif_info.loopb_wq);
        scif_destroy_intr_wq(scifdev);
        kfree(scifdev->qpairs->outbound_q.rb_base);
        kfree(scifdev->qpairs);
        scifdev->sdev = NULL;
        scif_info.loopb_dev = NULL;
        return 0;
}

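/*
 * Tears down the P2P mappings for the given node: both the mappings the
 * node holds for its peers and the mappings its peers hold for it.
 */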
void scif_destroy_p2p(struct scif_dev *scifdev)
{
        struct scif_dev *peer_dev;
        struct scif_p2p_info *p2p;
        struct list_head *pos, *tmp;
        int bd;

        mutex_lock(&scif_info.conflock);
        /* Free P2P mappings in the given node for all its peer nodes */
        list_for_each_safe(pos, tmp, &scifdev->p2p) {
                p2p = list_entry(pos, struct scif_p2p_info, ppi_list);
                dma_unmap_sg(&scifdev->sdev->dev, p2p->ppi_sg[SCIF_PPI_MMIO],
                             p2p->sg_nentries[SCIF_PPI_MMIO],
                             DMA_BIDIRECTIONAL);
                dma_unmap_sg(&scifdev->sdev->dev, p2p->ppi_sg[SCIF_PPI_APER],
                             p2p->sg_nentries[SCIF_PPI_APER],
                             DMA_BIDIRECTIONAL);
                scif_p2p_freesg(p2p->ppi_sg[SCIF_PPI_MMIO]);
                scif_p2p_freesg(p2p->ppi_sg[SCIF_PPI_APER]);
                list_del(pos);
                kfree(p2p);
        }

        /* Free P2P mapping created in the peer nodes for the given node */
        for (bd = SCIF_MGMT_NODE + 1; bd <= scif_info.maxid; bd++) {
                peer_dev = &scif_dev[bd];
                list_for_each_safe(pos, tmp, &peer_dev->p2p) {
                        p2p = list_entry(pos, struct scif_p2p_info, ppi_list);
                        if (p2p->ppi_peer_id == scifdev->node) {
                                dma_unmap_sg(&peer_dev->sdev->dev,
                                             p2p->ppi_sg[SCIF_PPI_MMIO],
                                             p2p->sg_nentries[SCIF_PPI_MMIO],
                                             DMA_BIDIRECTIONAL);
                                dma_unmap_sg(&peer_dev->sdev->dev,
                                             p2p->ppi_sg[SCIF_PPI_APER],
                                             p2p->sg_nentries[SCIF_PPI_APER],
                                             DMA_BIDIRECTIONAL);
                                scif_p2p_freesg(p2p->ppi_sg[SCIF_PPI_MMIO]);
                                scif_p2p_freesg(p2p->ppi_sg[SCIF_PPI_APER]);
                                list_del(pos);
                                kfree(p2p);
                        }
                }
        }
        mutex_unlock(&scif_info.conflock);
}