Merge remote-tracking branch 'origin/master'
[unleashed/lotheac.git] / usr / src / uts / common / io / rsm / rsm.c
blob7bf6d702a13d978656158c3040fe31f162f6b631
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 * Copyright 2012 Milan Jurik. All rights reserved.
25 * Copyright (c) 2016 by Delphix. All rights reserved.
26 * Copyright 2017 Joyent, Inc.
31 * Overview of the RSM Kernel Agent:
32 * ---------------------------------
34 * rsm.c constitutes the implementation of the RSM kernel agent. The RSM
35 * kernel agent is a pseudo device driver which makes use of the RSMPI
36 * interface on behalf of the RSMAPI user library.
38 * The kernel agent functionality can be categorized into the following
39 * components:
40 * 1. Driver Infrastructure
41 * 2. Export/Import Segment Management
42 * 3. Internal resource allocation/deallocation
44 * The driver infrastructure includes the basic module loading entry points
45 * like _init, _info, _fini to load, unload and report information about
46 * the driver module. The driver infrastructure also includes the
47 * autoconfiguration entry points namely, attach, detach and getinfo for
48 * the device autoconfiguration.
50 * The kernel agent is a pseudo character device driver and exports
51 * a cb_ops structure which defines the driver entry points for character
52 * device access. This includes the open and close entry points. The
53 * other entry points provided include ioctl, devmap and segmap and chpoll.
54 * read and write entry points are not used since the device is memory
55 * mapped. Also ddi_prop_op is used for the prop_op entry point.
57 * The ioctl entry point supports a number of commands, which are used by
58 * the RSMAPI library in order to export and import segments. These
59 * commands include commands for binding and rebinding the physical pages
60 * allocated to the virtual address range, publishing the export segment,
61 * unpublishing and republishing an export segment, creating an
62 * import segment and a virtual connection from this import segment to
63 * an export segment, performing scatter-gather data transfer, barrier
64 * operations.
67 * Export and Import segments:
68 * ---------------------------
70 * In order to create an RSM export segment a process allocates a range in its
71 * virtual address space for the segment using standard Solaris interfaces.
72 * The process then calls RSMAPI, which in turn makes an ioctl call to the
73 * RSM kernel agent for an allocation of physical memory pages and for
74 * creation of the export segment by binding these pages to the virtual
75 * address range. These pages are locked in memory so that remote accesses
76 * are always applied to the correct page. Then the RSM segment is published,
77 * again via RSMAPI making an ioctl to the RSM kernel agent, and a segment id
78 * is assigned to it.
80 * In order to import a published RSM segment, RSMAPI creates an import
81 * segment and forms a virtual connection across the interconnect to the
82 * export segment, via an ioctl into the kernel agent with the connect
83 * command. The import segment setup is completed by mapping the
84 * local device memory into the importer's virtual address space. The
85 * mapping of the import segment is handled by the segmap/devmap
86 * infrastructure described as follows.
88 * Segmap and Devmap interfaces:
90 * The RSM kernel agent allows device memory to be directly accessed by user
91 * threads via memory mapping. In order to do so, the RSM kernel agent
92 * supports the devmap and segmap entry points.
94 * The segmap entry point(rsm_segmap) is responsible for setting up a memory
95 * mapping as requested by mmap. The devmap entry point(rsm_devmap) is
96 * responsible for exporting the device memory to the user applications.
97 * rsm_segmap calls RSMPI rsm_map to allocate device memory. Then the
98 * control is transferred to the devmap_setup call which calls rsm_devmap.
100 * rsm_devmap validates the user mapping to the device or kernel memory
101 * and passes the information to the system for setting up the mapping. The
102 * actual setting up of the mapping is done by devmap_devmem_setup(for
103 * device memory) or devmap_umem_setup(for kernel memory). Callbacks are
104 * registered for device context management via the devmap_devmem_setup
105 * or devmap_umem_setup calls. The callbacks are rsmmap_map, rsmmap_unmap,
106 * rsmmap_access, rsmmap_dup. The callbacks are called when a new mapping
107 * is created, a mapping is freed, a mapping is accessed or an existing
108 * mapping is duplicated respectively. These callbacks allow the RSM kernel
109 * agent to maintain state information associated with the mappings.
110 * The state information is mainly in the form of a cookie list for the import
111 * segment for which mapping has been done.
113 * Forced disconnect of import segments:
115 * When an exported segment is unpublished, the exporter sends a forced
116 * disconnect message to all its importers. The importer segments are
117 * unloaded and disconnected. This involves unloading the original
118 * mappings and remapping to a preallocated kernel trash page. This is
119 * done by devmap_umem_remap. The trash/dummy page is a kernel page,
120 * preallocated by the kernel agent during attach using ddi_umem_alloc with
121 * the DDI_UMEM_TRASH flag set. This avoids a core dump in the application
122 * due to unloading of the original mappings.
124 * Additionally every segment has a mapping generation number associated
125 * with it. This is an entry in the barrier generation page, created
126 * during attach time. This mapping generation number for the import
127 * segments is incremented on a force disconnect to notify the application
128 * of the force disconnect. On this notification, the application needs
129 * to reconnect the segment to establish a new legitimate mapping.
132 * Locks used in the kernel agent:
133 * -------------------------------
135 * The kernel agent uses a variety of mutexes and condition variables for
136 * mutual exclusion of the shared data structures and for synchronization
137 * between the various threads. Some of the locks are described as follows.
139 * Each resource structure, which represents either an export/import segment
140 * has a lock associated with it. The lock is the resource mutex, rsmrc_lock.
141 * This is used directly by RSMRC_LOCK and RSMRC_UNLOCK macros and in the
142 * rsmseglock_acquire and rsmseglock_release macros. An additional
143 * lock called the rsmsi_lock is used for the shared import data structure
144 * that is relevant for resources representing import segments. There is
145 * also a condition variable associated with the resource called s_cv. This
146 * is used to wait for events like the segment state change etc.
148 * The resource structures are allocated from a pool of resource structures,
149 * called rsm_resource. This pool is protected via a reader-writer lock,
150 * called rsmrc_lock.
152 * There are two separate hash tables, one for the export segments and
153 * one for the import segments. The export segments are inserted into the
154 * export segment hash table only after they have been published and the
155 * import segments are inserted in the import segments list only after they
156 * have successfully connected to an exported segment. These tables are
157 * protected via reader-writer locks.
159 * Debug Support in the kernel agent:
160 * ----------------------------------
162 * Debugging support in the kernel agent is provided by the following
163 * macros.
165 * DBG_PRINTF((category, level, message)) is a macro which logs a debug
166 * message to the kernel agent's debug buffer, rsmka_dbg. This debug buffer
167 * can be viewed in kmdb as *rsmka_dbg/s. The message is logged based
168 * on the definition of the category and level. All messages that belong to
169 * the specified category(rsmdbg_category) and are of an equal or greater
170 * severity than the specified level(rsmdbg_level) are logged. The message
171 * is a string which uses the same formatting rules as the strings used in
172 * printf.
174 * The category defines which component of the kernel agent has logged this
175 * message. There are a number of categories that have been defined such as
176 * RSM_KERNEL_AGENT, RSM_OPS, RSM_IMPORT, RSM_EXPORT etc. A macro,
177 * DBG_ADDCATEGORY is used to add in another category to the currently
178 * specified category value so that the component using this new category
179 * can also effectively log debug messages. Thus, the category of a specific
180 * message is some combination of the available categories and we can define
181 * sub-categories if we want a finer level of granularity.
183 * The level defines the severity of the message. Different level values are
184 * defined, with RSM_ERR being the most severe and RSM_DEBUG_VERBOSE being
185 * the least severe(debug level is 0).
187 * DBG_DEFINE and DBG_DEFINE_STR are macros provided to declare a debug
188 * variable or a string respectively.
191 * NOTES:
193 * Special Fork and Exec Handling:
194 * -------------------------------
196 * The backing physical pages of an exported segment are always locked down.
197 * Thus, there are two cases in which a process having exported segments
198 * will cause a cpu to hang: (1) the process invokes exec; (2) a process
199 * forks and invokes exit before the duped file descriptors for the export
200 * segments are closed in the child process. The hang is caused because the
201 * address space release algorithm in Solaris VM subsystem is based on a
202 * non-blocking loop which does not terminate while segments are locked
203 * down. In addition to this, Solaris VM subsystem lacks a callback
204 * mechanism to the rsm kernel agent to allow unlocking these export
205 * segment pages.
207 * In order to circumvent this problem, the kernel agent does the following.
208 * The Solaris VM subsystem keeps memory segments in increasing order of
209 * virtual addresses. Thus a special page(special_exit_offset) is allocated
210 * by the kernel agent and is mmapped into the heap area of the process address
211 * space(the mmap is done by the RSMAPI library). During the mmap processing
212 * of this special page by the devmap infrastructure, a callback(the same
213 * devmap context management callbacks discussed above) is registered for an
214 * unmap.
216 * As discussed above, this page is processed by the Solaris address space
217 * release code before any of the exported segments pages(which are allocated
218 * from high memory). It is during this processing that the unmap callback gets
219 * called and this callback is responsible for force destroying the exported
220 * segments and thus eliminating the problem of locked pages.
222 * Flow-control:
223 * ------------
225 * A credit based flow control algorithm is used for messages whose
226 * processing cannot be done in the interrupt context because it might
227 * involve invoking rsmpi calls, or might take a long time to complete
228 * or might need to allocate resources. The algorithm operates on a per
229 * path basis. To send a message the pathend needs to have a credit and
230 * it consumes one for every message that is flow controlled. On the
231 * receiving pathend the message is put on a msgbuf_queue and a task is
232 * dispatched on the worker thread - recv_taskq where it is processed.
233 * After processing the message, the receiving pathend dequeues the message,
234 * and if it has processed > RSMIPC_LOTSFREE_MSGBUFS messages sends
235 * credits to the sender pathend.
237 * RSM_DRTEST:
238 * -----------
240 * This is used to enable the DR testing using a test driver on test
241 * platforms which do not support DR.
245 #include <sys/types.h>
246 #include <sys/param.h>
247 #include <sys/user.h>
248 #include <sys/buf.h>
249 #include <sys/systm.h>
250 #include <sys/cred.h>
251 #include <sys/vm.h>
252 #include <sys/uio.h>
253 #include <vm/seg.h>
254 #include <vm/page.h>
255 #include <sys/stat.h>
257 #include <sys/time.h>
258 #include <sys/errno.h>
260 #include <sys/file.h>
261 #include <sys/uio.h>
262 #include <sys/proc.h>
263 #include <sys/mman.h>
264 #include <sys/open.h>
265 #include <sys/atomic.h>
266 #include <sys/mem_config.h>
269 #include <sys/ddi.h>
270 #include <sys/devops.h>
271 #include <sys/ddidevmap.h>
272 #include <sys/sunddi.h>
273 #include <sys/esunddi.h>
274 #include <sys/ddi_impldefs.h>
276 #include <sys/kmem.h>
277 #include <sys/conf.h>
278 #include <sys/devops.h>
279 #include <sys/ddi_impldefs.h>
281 #include <sys/modctl.h>
283 #include <sys/policy.h>
284 #include <sys/types.h>
285 #include <sys/conf.h>
286 #include <sys/param.h>
288 #include <sys/taskq.h>
290 #include <sys/rsm/rsm_common.h>
291 #include <sys/rsm/rsmapi_common.h>
292 #include <sys/rsm/rsm.h>
293 #include <rsm_in.h>
294 #include <sys/rsm/rsmka_path_int.h>
295 #include <sys/rsm/rsmpi.h>
297 #include <sys/modctl.h>
298 #include <sys/debug.h>
300 #include <sys/tuneable.h>
302 #ifdef RSM_DRTEST
303 extern int rsm_kphysm_setup_func_register(kphysm_setup_vector_t *vec,
304 void *arg);
305 extern void rsm_kphysm_setup_func_unregister(kphysm_setup_vector_t *vec,
306 void *arg);
307 #endif
309 extern void dbg_printf(int category, int level, char *fmt, ...);
310 extern void rsmka_pathmanager_init();
311 extern void rsmka_pathmanager_cleanup();
312 extern void rele_sendq_token(sendq_token_t *);
313 extern rsm_addr_t get_remote_hwaddr(adapter_t *, rsm_node_id_t);
314 extern rsm_node_id_t get_remote_nodeid(adapter_t *, rsm_addr_t);
315 extern int rsmka_topology_ioctl(caddr_t, int, int);
317 extern pri_t maxclsyspri;
318 extern work_queue_t work_queue;
319 extern kmutex_t ipc_info_lock;
320 extern kmutex_t ipc_info_cvlock;
321 extern kcondvar_t ipc_info_cv;
322 extern kmutex_t path_hold_cvlock;
323 extern kcondvar_t path_hold_cv;
325 extern kmutex_t rsmka_buf_lock;
327 extern path_t *rsm_find_path(char *, int, rsm_addr_t);
328 extern adapter_t *rsmka_lookup_adapter(char *, int);
329 extern sendq_token_t *rsmka_get_sendq_token(rsm_node_id_t, sendq_token_t *);
330 extern boolean_t rsmka_do_path_active(path_t *, int);
331 extern boolean_t rsmka_check_node_alive(rsm_node_id_t);
332 extern void rsmka_release_adapter(adapter_t *);
333 extern void rsmka_enqueue_msgbuf(path_t *path, void *data);
334 extern void rsmka_dequeue_msgbuf(path_t *path);
335 extern msgbuf_elem_t *rsmka_gethead_msgbuf(path_t *path);
336 /* lint -w2 */
338 static int rsm_open(dev_t *, int, int, cred_t *);
339 static int rsm_close(dev_t, int, int, cred_t *);
340 static int rsm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode,
341 cred_t *credp, int *rvalp);
342 static int rsm_devmap(dev_t, devmap_cookie_t, offset_t, size_t, size_t *,
343 uint_t);
344 static int rsm_segmap(dev_t, off_t, struct as *, caddr_t *, off_t, uint_t,
345 uint_t, uint_t, cred_t *);
346 static int rsm_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
347 struct pollhead **phpp);
349 static int rsm_info(dev_info_t *, ddi_info_cmd_t, void *, void **);
350 static int rsm_attach(dev_info_t *, ddi_attach_cmd_t);
351 static int rsm_detach(dev_info_t *, ddi_detach_cmd_t);
353 static int rsmipc_send(rsm_node_id_t, rsmipc_request_t *, rsmipc_reply_t *);
354 static void rsm_force_unload(rsm_node_id_t, rsm_memseg_id_t, boolean_t);
355 static void rsm_send_importer_disconnects(rsm_memseg_id_t, rsm_node_id_t);
356 static void rsm_send_republish(rsm_memseg_id_t, rsmapi_access_entry_t *, int,
357 rsm_permission_t);
358 static void rsm_export_force_destroy(ddi_umem_cookie_t *);
359 static void rsmacl_free(rsmapi_access_entry_t *, int);
360 static void rsmpiacl_free(rsm_access_entry_t *, int);
362 static int rsm_inc_pgcnt(pgcnt_t);
363 static void rsm_dec_pgcnt(pgcnt_t);
364 static void rsm_free_mapinfo(rsm_mapinfo_t *mapinfop);
365 static rsm_mapinfo_t *rsm_get_mapinfo(rsmseg_t *, off_t, size_t, off_t *,
366 size_t *);
367 static void exporter_quiesce();
368 static void rsmseg_suspend(rsmseg_t *, int *);
369 static void rsmsegshare_suspend(rsmseg_t *);
370 static int rsmseg_resume(rsmseg_t *, void **);
371 static int rsmsegshare_resume(rsmseg_t *);
373 static struct cb_ops rsm_cb_ops = {
374 rsm_open, /* open */
375 rsm_close, /* close */
376 nodev, /* strategy */
377 nodev, /* print */
378 nodev, /* dump */
379 nodev, /* read */
380 nodev, /* write */
381 rsm_ioctl, /* ioctl */
382 rsm_devmap, /* devmap */
383 NULL, /* mmap */
384 rsm_segmap, /* segmap */
385 rsm_chpoll, /* poll */
386 ddi_prop_op, /* cb_prop_op */
387 0, /* streamtab */
388 D_NEW|D_MP|D_DEVMAP, /* Driver compatibility flag */
394 static struct dev_ops rsm_ops = {
395 DEVO_REV, /* devo_rev, */
396 0, /* refcnt */
397 rsm_info, /* get_dev_info */
398 nulldev, /* identify */
399 nulldev, /* probe */
400 rsm_attach, /* attach */
401 rsm_detach, /* detach */
402 nodev, /* reset */
403 &rsm_cb_ops, /* driver operations */
404 NULL, /* bus operations */
406 ddi_quiesce_not_needed, /* quiesce */
410 * Module linkage information for the kernel.
413 static struct modldrv modldrv = {
414 &mod_driverops, /* Type of module. This one is a pseudo driver */
415 "Remote Shared Memory Driver",
416 &rsm_ops, /* driver ops */
419 static struct modlinkage modlinkage = {
420 MODREV_1,
421 (void *)&modldrv,
427 static void rsm_dr_callback_post_add(void *arg, pgcnt_t delta);
428 static int rsm_dr_callback_pre_del(void *arg, pgcnt_t delta);
429 static void rsm_dr_callback_post_del(void *arg, pgcnt_t delta, int cancelled);
431 static kphysm_setup_vector_t rsm_dr_callback_vec = {
432 KPHYSM_SETUP_VECTOR_VERSION,
433 rsm_dr_callback_post_add,
434 rsm_dr_callback_pre_del,
435 rsm_dr_callback_post_del
438 /* This flag can be changed to 0 to help with PIT testing */
439 int rsmka_modunloadok = 1;
440 int no_reply_cnt = 0;
442 uint64_t rsm_ctrlmsg_errcnt = 0;
443 uint64_t rsm_ipcsend_errcnt = 0;
445 #define MAX_NODES 64
447 static struct rsm_driver_data rsm_drv_data;
448 static struct rsmresource_table rsm_resource;
450 static void rsmresource_insert(minor_t, rsmresource_t *, rsm_resource_type_t);
451 static void rsmresource_destroy(void);
452 static int rsmresource_alloc(minor_t *);
453 static rsmresource_t *rsmresource_free(minor_t rnum);
454 static int rsm_closeconnection(rsmseg_t *seg, void **cookie);
455 static int rsm_unpublish(rsmseg_t *seg, int mode);
456 static int rsm_unbind(rsmseg_t *seg);
457 static uint_t rsmhash(rsm_memseg_id_t key);
458 static void rsmhash_alloc(rsmhash_table_t *rhash, int size);
459 static void rsmhash_free(rsmhash_table_t *rhash, int size);
460 static void *rsmhash_getbkt(rsmhash_table_t *rhash, uint_t hashval);
461 static void **rsmhash_bktaddr(rsmhash_table_t *rhash, uint_t hashval);
462 static int rsm_send_notimporting(rsm_node_id_t dest, rsm_memseg_id_t segid,
463 void *cookie);
464 int rsm_disconnect(rsmseg_t *seg);
465 void rsmseg_unload(rsmseg_t *);
466 void rsm_suspend_complete(rsm_node_id_t src_node, int flag);
468 rsm_intr_hand_ret_t rsm_srv_func(rsm_controller_object_t *chd,
469 rsm_intr_q_op_t opcode, rsm_addr_t src,
470 void *data, size_t size, rsm_intr_hand_arg_t arg);
472 static void rsm_intr_callback(void *, rsm_addr_t, rsm_intr_hand_arg_t);
474 rsm_node_id_t my_nodeid;
476 /* cookie, va, offsets and length for the barrier */
477 static rsm_gnum_t *bar_va;
478 static ddi_umem_cookie_t bar_cookie;
479 static off_t barrier_offset;
480 static size_t barrier_size;
481 static int max_segs;
483 /* cookie for the trash memory */
484 static ddi_umem_cookie_t remap_cookie;
486 static rsm_memseg_id_t rsm_nextavail_segmentid;
488 extern taskq_t *work_taskq;
489 extern char *taskq_name;
491 static dev_info_t *rsm_dip; /* private copy of devinfo pointer */
493 static rsmhash_table_t rsm_export_segs; /* list of exported segs */
494 rsmhash_table_t rsm_import_segs; /* list of imported segs */
495 static rsmhash_table_t rsm_event_queues; /* list of event queues */
497 static rsm_ipc_t rsm_ipc; /* ipc info */
499 /* list of nodes to which RSMIPC_MSG_SUSPEND has been sent */
500 static list_head_t rsm_suspend_list;
502 /* list of descriptors for remote importers */
503 static importers_table_t importer_list;
505 kmutex_t rsm_suspend_cvlock;
506 kcondvar_t rsm_suspend_cv;
508 static kmutex_t rsm_lock;
510 adapter_t loopback_adapter;
511 rsm_controller_attr_t loopback_attr;
513 int rsmipc_send_controlmsg(path_t *path, int msgtype);
515 void rsmka_init_loopback();
517 int rsmka_null_seg_create(
518 rsm_controller_handle_t,
519 rsm_memseg_export_handle_t *,
520 size_t,
521 uint_t,
522 rsm_memory_local_t *,
523 rsm_resource_callback_t,
524 rsm_resource_callback_arg_t);
526 int rsmka_null_seg_destroy(
527 rsm_memseg_export_handle_t);
529 int rsmka_null_bind(
530 rsm_memseg_export_handle_t,
531 off_t,
532 rsm_memory_local_t *,
533 rsm_resource_callback_t,
534 rsm_resource_callback_arg_t);
536 int rsmka_null_unbind(
537 rsm_memseg_export_handle_t,
538 off_t,
539 size_t);
541 int rsmka_null_rebind(
542 rsm_memseg_export_handle_t,
543 off_t,
544 rsm_memory_local_t *,
545 rsm_resource_callback_t,
546 rsm_resource_callback_arg_t);
548 int rsmka_null_publish(
549 rsm_memseg_export_handle_t,
550 rsm_access_entry_t [],
551 uint_t,
552 rsm_memseg_id_t,
553 rsm_resource_callback_t,
554 rsm_resource_callback_arg_t);
557 int rsmka_null_republish(
558 rsm_memseg_export_handle_t,
559 rsm_access_entry_t [],
560 uint_t,
561 rsm_resource_callback_t,
562 rsm_resource_callback_arg_t);
564 int rsmka_null_unpublish(
565 rsm_memseg_export_handle_t);
567 rsm_ops_t null_rsmpi_ops;
570 * data and locks to keep track of total amount of exported memory
572 static pgcnt_t rsm_pgcnt;
573 static pgcnt_t rsm_pgcnt_max; /* max allowed */
574 static kmutex_t rsm_pgcnt_lock;
576 static int rsm_enable_dr;
578 static char loopback_str[] = "loopback";
580 int rsm_hash_size;
583 * The locking model is as follows:
585 * Local operations:
586 * find resource - grab reader lock on resource list
587 * insert rc - grab writer lock
588 * delete rc - grab writer lock and resource mutex
589 * read/write - no lock
591 * Remote invocations:
592 * find resource - grab read lock and resource mutex
594 * State:
595 * resource state - grab resource mutex
599 _init(void)
601 int e;
603 e = mod_install(&modlinkage);
604 if (e != 0) {
605 return (e);
608 mutex_init(&rsm_lock, NULL, MUTEX_DRIVER, NULL);
610 mutex_init(&rsmka_buf_lock, NULL, MUTEX_DEFAULT, NULL);
613 rw_init(&rsm_resource.rsmrc_lock, NULL, RW_DRIVER, NULL);
615 rsm_hash_size = RSM_HASHSZ;
617 rw_init(&rsm_export_segs.rsmhash_rw, NULL, RW_DRIVER, NULL);
619 rw_init(&rsm_import_segs.rsmhash_rw, NULL, RW_DRIVER, NULL);
621 mutex_init(&importer_list.lock, NULL, MUTEX_DRIVER, NULL);
623 mutex_init(&rsm_ipc.lock, NULL, MUTEX_DRIVER, NULL);
624 cv_init(&rsm_ipc.cv, NULL, CV_DRIVER, 0);
626 mutex_init(&rsm_suspend_cvlock, NULL, MUTEX_DRIVER, NULL);
627 cv_init(&rsm_suspend_cv, NULL, CV_DRIVER, 0);
629 mutex_init(&rsm_drv_data.drv_lock, NULL, MUTEX_DRIVER, NULL);
630 cv_init(&rsm_drv_data.drv_cv, NULL, CV_DRIVER, 0);
632 rsm_ipc.count = RSMIPC_SZ;
633 rsm_ipc.wanted = 0;
634 rsm_ipc.sequence = 0;
636 (void) mutex_init(&rsm_pgcnt_lock, NULL, MUTEX_DRIVER, NULL);
638 for (e = 0; e < RSMIPC_SZ; e++) {
639 rsmipc_slot_t *slot = &rsm_ipc.slots[e];
641 RSMIPC_SET(slot, RSMIPC_FREE);
642 mutex_init(&slot->rsmipc_lock, NULL, MUTEX_DRIVER, NULL);
643 cv_init(&slot->rsmipc_cv, NULL, CV_DRIVER, 0);
647 * Initialize the suspend message list
649 rsm_suspend_list.list_head = NULL;
650 mutex_init(&rsm_suspend_list.list_lock, NULL, MUTEX_DRIVER, NULL);
653 * It is assumed here that configuration data is available
654 * during system boot since _init may be called at that time.
657 rsmka_pathmanager_init();
659 DBG_PRINTF((RSM_KERNEL_AGENT, RSM_DEBUG_VERBOSE,
660 "rsm: _init done\n"));
662 return (DDI_SUCCESS);
667 _info(struct modinfo *modinfop)
670 return (mod_info(&modlinkage, modinfop));
674 _fini(void)
676 int e;
678 DBG_PRINTF((RSM_KERNEL_AGENT, RSM_DEBUG_VERBOSE,
679 "rsm: _fini enter\n"));
682 * The rsmka_modunloadok flag is simply used to help with
683 * the PIT testing. Make this flag 0 to disallow modunload.
685 if (rsmka_modunloadok == 0)
686 return (EBUSY);
688 /* rsm_detach will be called as a result of mod_remove */
689 e = mod_remove(&modlinkage);
690 if (e) {
691 DBG_PRINTF((RSM_KERNEL_AGENT, RSM_ERR,
692 "Unable to fini RSM %x\n", e));
693 return (e);
696 rsmka_pathmanager_cleanup();
698 rw_destroy(&rsm_resource.rsmrc_lock);
700 rw_destroy(&rsm_export_segs.rsmhash_rw);
701 rw_destroy(&rsm_import_segs.rsmhash_rw);
702 rw_destroy(&rsm_event_queues.rsmhash_rw);
704 mutex_destroy(&importer_list.lock);
706 mutex_destroy(&rsm_ipc.lock);
707 cv_destroy(&rsm_ipc.cv);
709 (void) mutex_destroy(&rsm_suspend_list.list_lock);
711 (void) mutex_destroy(&rsm_pgcnt_lock);
713 DBG_PRINTF((RSM_KERNEL_AGENT, RSM_DEBUG_VERBOSE, "_fini done\n"));
715 return (DDI_SUCCESS);
719 /*ARGSUSED1*/
720 static int
721 rsm_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
723 minor_t rnum;
724 int percent;
725 int ret;
726 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_DDI);
728 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_attach enter\n"));
730 switch (cmd) {
731 case DDI_ATTACH:
732 break;
733 case DDI_RESUME:
734 default:
735 DBG_PRINTF((category, RSM_ERR,
736 "rsm:rsm_attach - cmd not supported\n"));
737 return (DDI_FAILURE);
740 if (rsm_dip != NULL) {
741 DBG_PRINTF((category, RSM_ERR,
742 "rsm:rsm_attach - supports only "
743 "one instance\n"));
744 return (DDI_FAILURE);
747 rsm_enable_dr = ddi_prop_get_int(DDI_DEV_T_ANY, devi,
748 DDI_PROP_DONTPASS | DDI_PROP_NOTPROM,
749 "enable-dynamic-reconfiguration", 1);
751 mutex_enter(&rsm_drv_data.drv_lock);
752 rsm_drv_data.drv_state = RSM_DRV_REG_PROCESSING;
753 mutex_exit(&rsm_drv_data.drv_lock);
755 if (rsm_enable_dr) {
756 #ifdef RSM_DRTEST
757 ret = rsm_kphysm_setup_func_register(&rsm_dr_callback_vec,
758 NULL);
759 #else
760 ret = kphysm_setup_func_register(&rsm_dr_callback_vec,
761 NULL);
762 #endif
763 if (ret != 0) {
764 mutex_exit(&rsm_drv_data.drv_lock);
765 cmn_err(CE_CONT, "rsm:rsm_attach - Dynamic "
766 "reconfiguration setup failed\n");
767 return (DDI_FAILURE);
771 mutex_enter(&rsm_drv_data.drv_lock);
772 ASSERT(rsm_drv_data.drv_state == RSM_DRV_REG_PROCESSING);
773 rsm_drv_data.drv_state = RSM_DRV_OK;
774 cv_broadcast(&rsm_drv_data.drv_cv);
775 mutex_exit(&rsm_drv_data.drv_lock);
778 * page_list_read_lock();
779 * xx_setup();
780 * page_list_read_unlock();
783 rsm_hash_size = ddi_prop_get_int(DDI_DEV_T_ANY, devi,
784 DDI_PROP_DONTPASS | DDI_PROP_NOTPROM,
785 "segment-hashtable-size", RSM_HASHSZ);
786 if (rsm_hash_size == 0) {
787 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
788 "rsm: segment-hashtable-size in rsm.conf "
789 "must be greater than 0, defaulting to 128\n"));
790 rsm_hash_size = RSM_HASHSZ;
793 DBG_PRINTF((category, RSM_DEBUG, "rsm_attach rsm_hash_size: %d\n",
794 rsm_hash_size));
796 rsm_pgcnt = 0;
798 percent = ddi_prop_get_int(DDI_DEV_T_ANY, devi,
799 DDI_PROP_DONTPASS | DDI_PROP_NOTPROM,
800 "max-exported-memory", 0);
801 if (percent < 0) {
802 DBG_PRINTF((category, RSM_ERR,
803 "rsm:rsm_attach not enough memory available to "
804 "export, or max-exported-memory set incorrectly.\n"));
805 return (DDI_FAILURE);
807 /* 0 indicates no fixed upper limit. maxmem is the max */
808 /* available pageable physical mem */
809 rsm_pgcnt_max = (percent*maxmem)/100;
811 if (rsm_pgcnt_max > 0) {
812 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
813 "rsm: Available physical memory = %lu pages, "
814 "Max exportable memory = %lu pages",
815 maxmem, rsm_pgcnt_max));
819 * Create minor number
821 if (rsmresource_alloc(&rnum) != RSM_SUCCESS) {
822 DBG_PRINTF((category, RSM_ERR,
823 "rsm: rsm_attach - Unable to get "
824 "minor number\n"));
825 return (DDI_FAILURE);
828 ASSERT(rnum == RSM_DRIVER_MINOR);
830 if (ddi_create_minor_node(devi, DRIVER_NAME, S_IFCHR,
831 rnum, DDI_PSEUDO, 0) == DDI_FAILURE) {
832 DBG_PRINTF((category, RSM_ERR,
833 "rsm: rsm_attach - unable to allocate "
834 "minor #\n"));
835 return (DDI_FAILURE);
838 rsm_dip = devi;
840 * Allocate the hashtables
842 rsmhash_alloc(&rsm_export_segs, rsm_hash_size);
843 rsmhash_alloc(&rsm_import_segs, rsm_hash_size);
845 importer_list.bucket = (importing_token_t **)
846 kmem_zalloc(rsm_hash_size * sizeof (importing_token_t *), KM_SLEEP);
849 * Allocate a resource struct
852 rsmresource_t *p;
854 p = (rsmresource_t *)kmem_zalloc(sizeof (*p), KM_SLEEP);
856 mutex_init(&p->rsmrc_lock, NULL, MUTEX_DRIVER, NULL);
858 rsmresource_insert(rnum, p, RSM_RESOURCE_BAR);
862 * Based on the rsm.conf property max-segments, determine the maximum
863 * number of segments that can be exported/imported. This is then used
864 * to determine the size for barrier failure pages.
867 /* First get the max number of segments from the rsm.conf file */
868 max_segs = ddi_prop_get_int(DDI_DEV_T_ANY, devi,
869 DDI_PROP_DONTPASS | DDI_PROP_NOTPROM,
870 "max-segments", 0);
871 if (max_segs == 0) {
872 /* Use default number of segments */
873 max_segs = RSM_MAX_NUM_SEG;
877 * Based on the max number of segments allowed, determine the barrier
878 * page size. add 1 to max_segs since the barrier page itself uses
879 * a slot
881 barrier_size = roundup((max_segs + 1) * sizeof (rsm_gnum_t),
882 PAGESIZE);
885 * allocation of the barrier failure page
887 bar_va = (rsm_gnum_t *)ddi_umem_alloc(barrier_size,
888 DDI_UMEM_SLEEP, &bar_cookie);
891 * Set the barrier_offset
893 barrier_offset = 0;
896 * Allocate a trash memory and get a cookie for it. This will be used
897 * when remapping segments during force disconnects. Allocate the
898 * trash memory with a large size which is page aligned.
900 (void) ddi_umem_alloc((size_t)TRASHSIZE,
901 DDI_UMEM_TRASH, &remap_cookie);
903 /* initialize user segment id allocation variable */
904 rsm_nextavail_segmentid = (rsm_memseg_id_t)RSM_USER_APP_ID_BASE;
907 * initialize the null_rsmpi_ops vector and the loopback adapter
909 rsmka_init_loopback();
912 ddi_report_dev(devi);
914 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_attach done\n"));
916 return (DDI_SUCCESS);
920 * The call to mod_remove in the _fine routine will cause the system
921 * to call rsm_detach
923 /*ARGSUSED*/
924 static int
925 rsm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
927 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_DDI);
929 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_detach enter\n"));
931 switch (cmd) {
932 case DDI_DETACH:
933 break;
934 default:
935 DBG_PRINTF((category, RSM_ERR,
936 "rsm:rsm_detach - cmd %x not supported\n",
937 cmd));
938 return (DDI_FAILURE);
941 mutex_enter(&rsm_drv_data.drv_lock);
942 while (rsm_drv_data.drv_state != RSM_DRV_OK)
943 cv_wait(&rsm_drv_data.drv_cv, &rsm_drv_data.drv_lock);
944 rsm_drv_data.drv_state = RSM_DRV_UNREG_PROCESSING;
945 mutex_exit(&rsm_drv_data.drv_lock);
948 * Unregister the DR callback functions
950 if (rsm_enable_dr) {
951 #ifdef RSM_DRTEST
952 rsm_kphysm_setup_func_unregister(&rsm_dr_callback_vec,
953 NULL);
954 #else
955 kphysm_setup_func_unregister(&rsm_dr_callback_vec,
956 NULL);
957 #endif
960 mutex_enter(&rsm_drv_data.drv_lock);
961 ASSERT(rsm_drv_data.drv_state == RSM_DRV_UNREG_PROCESSING);
962 rsm_drv_data.drv_state = RSM_DRV_NEW;
963 mutex_exit(&rsm_drv_data.drv_lock);
965 ASSERT(rsm_suspend_list.list_head == NULL);
968 * Release all resources, seglist, controller, ...
971 /* remove intersend queues */
972 /* remove registered services */
975 ddi_remove_minor_node(dip, DRIVER_NAME);
976 rsm_dip = NULL;
979 * Free minor zero resource
982 rsmresource_t *p;
984 p = rsmresource_free(RSM_DRIVER_MINOR);
985 if (p) {
986 mutex_destroy(&p->rsmrc_lock);
987 kmem_free((void *)p, sizeof (*p));
992 * Free resource table
995 rsmresource_destroy();
998 * Free the hash tables
1000 rsmhash_free(&rsm_export_segs, rsm_hash_size);
1001 rsmhash_free(&rsm_import_segs, rsm_hash_size);
1003 kmem_free((void *)importer_list.bucket,
1004 rsm_hash_size * sizeof (importing_token_t *));
1005 importer_list.bucket = NULL;
1008 /* free barrier page */
1009 if (bar_cookie != NULL) {
1010 ddi_umem_free(bar_cookie);
1012 bar_va = NULL;
1013 bar_cookie = NULL;
1016 * Free the memory allocated for the trash
1018 if (remap_cookie != NULL) {
1019 ddi_umem_free(remap_cookie);
1021 remap_cookie = NULL;
1023 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_detach done\n"));
1025 return (DDI_SUCCESS);
1028 /*ARGSUSED*/
1029 static int
1030 rsm_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
1032 register int error;
1033 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_DDI);
1035 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_info enter\n"));
1037 switch (infocmd) {
1038 case DDI_INFO_DEVT2DEVINFO:
1039 if (rsm_dip == NULL)
1040 error = DDI_FAILURE;
1041 else {
1042 *result = (void *)rsm_dip;
1043 error = DDI_SUCCESS;
1045 break;
1046 case DDI_INFO_DEVT2INSTANCE:
1047 *result = NULL;
1048 error = DDI_SUCCESS;
1049 break;
1050 default:
1051 error = DDI_FAILURE;
1054 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_info done\n"));
1055 return (error);
1058 adapter_t *
1059 rsm_getadapter(rsm_ioctlmsg_t *msg, int mode)
1061 adapter_t *adapter;
1062 char adapter_devname[MAXNAMELEN];
1063 int instance;
1064 DBG_DEFINE(category,
1065 RSM_KERNEL_AGENT | RSM_IMPORT | RSM_EXPORT | RSM_IOCTL);
1067 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_getadapter enter\n"));
1069 instance = msg->cnum;
1071 if ((msg->cname_len <= 0) || (msg->cname_len > MAXNAMELEN)) {
1072 return (NULL);
1075 if (ddi_copyin(msg->cname, adapter_devname, msg->cname_len, mode))
1076 return (NULL);
1078 if (strcmp(adapter_devname, "loopback") == 0)
1079 return (&loopback_adapter);
1081 adapter = rsmka_lookup_adapter(adapter_devname, instance);
1083 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_getadapter done\n"));
1085 return (adapter);
1090 * *********************** Resource Number Management ********************
1091 * All resources are stored in a simple hash table. The table is an array
1092 * of pointers to resource blks. Each blk contains:
1093 * base - base number of this blk
1094 * used - number of used slots in this blk.
1095 * blks - array of pointers to resource items.
1096 * An entry in a resource blk is empty if it's NULL.
1098 * We start with no resource array. Each time we run out of slots, we
1099 * reallocate a new larger array and copy the pointer to the new array and
1100 * a new resource blk is allocated and added to the hash table.
1102 * The resource control block contains:
1103 * root - array of pointer of resource blks
1104 * sz - current size of array.
1105 * len - last valid entry in array.
1107 * A search operation based on a resource number is as follows:
1108 * index = rnum / RESOURCE_BLKSZ;
1109 * ASSERT(index < resource_block.len);
1110 * ASSERT(index < resource_block.sz);
1111 * offset = rnum % RESOURCE_BLKSZ;
1112 * ASSERT(offset >= resource_block.root[index]->base);
1113 * ASSERT(offset < resource_block.root[index]->base + RESOURCE_BLKSZ);
1114 * return resource_block.root[index]->blks[offset];
1116 * A resource blk is freed with its used count reachs zero.
1118 static int
1119 rsmresource_alloc(minor_t *rnum)
1122 /* search for available resource slot */
1123 int i, j, empty = -1;
1124 rsmresource_blk_t *blk;
1126 DBG_PRINTF((RSM_KERNEL_ALL, RSM_DEBUG_VERBOSE,
1127 "rsmresource_alloc enter\n"));
1129 rw_enter(&rsm_resource.rsmrc_lock, RW_WRITER);
1131 /* Try to find an empty slot */
1132 for (i = 0; i < rsm_resource.rsmrc_len; i++) {
1133 blk = rsm_resource.rsmrc_root[i];
1134 if (blk != NULL && blk->rsmrcblk_avail > 0) {
1135 /* found an empty slot in this blk */
1136 for (j = 0; j < RSMRC_BLKSZ; j++) {
1137 if (blk->rsmrcblk_blks[j] == NULL) {
1138 *rnum = (minor_t)
1139 (j + (i * RSMRC_BLKSZ));
1141 * obey gen page limits
1143 if (*rnum >= max_segs + 1) {
1144 if (empty < 0) {
1145 rw_exit(&rsm_resource.
1146 rsmrc_lock);
1147 DBG_PRINTF((
1148 RSM_KERNEL_ALL,
1149 RSM_ERR,
1150 "rsmresource"
1151 "_alloc failed:"
1152 "not enough res"
1153 "%d\n", *rnum));
1154 return (RSMERR_INSUFFICIENT_RESOURCES);
1155 } else {
1156 /* use empty slot */
1157 break;
1162 blk->rsmrcblk_blks[j] = RSMRC_RESERVED;
1163 blk->rsmrcblk_avail--;
1164 rw_exit(&rsm_resource.rsmrc_lock);
1165 DBG_PRINTF((RSM_KERNEL_ALL,
1166 RSM_DEBUG_VERBOSE,
1167 "rsmresource_alloc done\n"));
1168 return (RSM_SUCCESS);
1171 } else if (blk == NULL && empty < 0) {
1172 /* remember first empty slot */
1173 empty = i;
1177 /* Couldn't find anything, allocate a new blk */
1179 * Do we need to reallocate the root array
1181 if (empty < 0) {
1182 if (rsm_resource.rsmrc_len == rsm_resource.rsmrc_sz) {
1184 * Allocate new array and copy current stuff into it
1186 rsmresource_blk_t **p;
1187 uint_t newsz = (uint_t)rsm_resource.rsmrc_sz +
1188 RSMRC_BLKSZ;
1190 * Don't allocate more that max valid rnum
1192 if (rsm_resource.rsmrc_len*RSMRC_BLKSZ >=
1193 max_segs + 1) {
1194 rw_exit(&rsm_resource.rsmrc_lock);
1195 return (RSMERR_INSUFFICIENT_RESOURCES);
1198 p = (rsmresource_blk_t **)kmem_zalloc(
1199 newsz * sizeof (*p),
1200 KM_SLEEP);
1202 if (rsm_resource.rsmrc_root) {
1203 uint_t oldsz;
1205 oldsz = (uint_t)(rsm_resource.rsmrc_sz *
1206 (int)sizeof (*p));
1209 * Copy old data into new space and
1210 * free old stuff
1212 bcopy(rsm_resource.rsmrc_root, p, oldsz);
1213 kmem_free(rsm_resource.rsmrc_root, oldsz);
1216 rsm_resource.rsmrc_root = p;
1217 rsm_resource.rsmrc_sz = (int)newsz;
1220 empty = rsm_resource.rsmrc_len;
1221 rsm_resource.rsmrc_len++;
1225 * Allocate a new blk
1227 blk = (rsmresource_blk_t *)kmem_zalloc(sizeof (*blk), KM_SLEEP);
1228 ASSERT(rsm_resource.rsmrc_root[empty] == NULL);
1229 rsm_resource.rsmrc_root[empty] = blk;
1230 blk->rsmrcblk_avail = RSMRC_BLKSZ - 1;
1233 * Allocate slot
1236 *rnum = (minor_t)(empty * RSMRC_BLKSZ);
1239 * watch out not to exceed bounds of barrier page
1241 if (*rnum >= max_segs + 1) {
1242 rw_exit(&rsm_resource.rsmrc_lock);
1243 DBG_PRINTF((RSM_KERNEL_ALL, RSM_ERR,
1244 "rsmresource_alloc failed %d\n", *rnum));
1246 return (RSMERR_INSUFFICIENT_RESOURCES);
1248 blk->rsmrcblk_blks[0] = RSMRC_RESERVED;
1251 rw_exit(&rsm_resource.rsmrc_lock);
1253 DBG_PRINTF((RSM_KERNEL_ALL, RSM_DEBUG_VERBOSE,
1254 "rsmresource_alloc done\n"));
1256 return (RSM_SUCCESS);
1259 static rsmresource_t *
1260 rsmresource_free(minor_t rnum)
1263 /* search for available resource slot */
1264 int i, j;
1265 rsmresource_blk_t *blk;
1266 rsmresource_t *p;
1268 DBG_PRINTF((RSM_KERNEL_ALL, RSM_DEBUG_VERBOSE,
1269 "rsmresource_free enter\n"));
1271 i = (int)(rnum / RSMRC_BLKSZ);
1272 j = (int)(rnum % RSMRC_BLKSZ);
1274 if (i >= rsm_resource.rsmrc_len) {
1275 DBG_PRINTF((RSM_KERNEL_ALL, RSM_DEBUG_VERBOSE,
1276 "rsmresource_free done\n"));
1277 return (NULL);
1280 rw_enter(&rsm_resource.rsmrc_lock, RW_WRITER);
1282 ASSERT(rsm_resource.rsmrc_root);
1283 ASSERT(i < rsm_resource.rsmrc_len);
1284 ASSERT(i < rsm_resource.rsmrc_sz);
1285 blk = rsm_resource.rsmrc_root[i];
1286 if (blk == NULL) {
1287 rw_exit(&rsm_resource.rsmrc_lock);
1288 DBG_PRINTF((RSM_KERNEL_ALL, RSM_DEBUG_VERBOSE,
1289 "rsmresource_free done\n"));
1290 return (NULL);
1293 ASSERT(blk->rsmrcblk_blks[j]); /* reserved or full */
1295 p = blk->rsmrcblk_blks[j];
1296 if (p == RSMRC_RESERVED) {
1297 p = NULL;
1300 blk->rsmrcblk_blks[j] = NULL;
1301 blk->rsmrcblk_avail++;
1302 if (blk->rsmrcblk_avail == RSMRC_BLKSZ) {
1303 /* free this blk */
1304 kmem_free(blk, sizeof (*blk));
1305 rsm_resource.rsmrc_root[i] = NULL;
1308 rw_exit(&rsm_resource.rsmrc_lock);
1310 DBG_PRINTF((RSM_KERNEL_ALL, RSM_DEBUG_VERBOSE,
1311 "rsmresource_free done\n"));
1313 return (p);
1316 static rsmresource_t *
1317 rsmresource_lookup(minor_t rnum, int lock)
1319 int i, j;
1320 rsmresource_blk_t *blk;
1321 rsmresource_t *p;
1323 DBG_PRINTF((RSM_KERNEL_ALL, RSM_DEBUG_VERBOSE,
1324 "rsmresource_lookup enter\n"));
1326 /* Find resource and lock it in READER mode */
1327 /* search for available resource slot */
1329 i = (int)(rnum / RSMRC_BLKSZ);
1330 j = (int)(rnum % RSMRC_BLKSZ);
1332 if (i >= rsm_resource.rsmrc_len) {
1333 DBG_PRINTF((RSM_KERNEL_ALL, RSM_DEBUG_VERBOSE,
1334 "rsmresource_lookup done\n"));
1335 return (NULL);
1338 rw_enter(&rsm_resource.rsmrc_lock, RW_READER);
1340 blk = rsm_resource.rsmrc_root[i];
1341 if (blk != NULL) {
1342 ASSERT(i < rsm_resource.rsmrc_len);
1343 ASSERT(i < rsm_resource.rsmrc_sz);
1345 p = blk->rsmrcblk_blks[j];
1346 if (lock == RSM_LOCK) {
1347 if (p != RSMRC_RESERVED) {
1348 mutex_enter(&p->rsmrc_lock);
1349 } else {
1350 p = NULL;
1353 } else {
1354 p = NULL;
1356 rw_exit(&rsm_resource.rsmrc_lock);
1358 DBG_PRINTF((RSM_KERNEL_ALL, RSM_DEBUG_VERBOSE,
1359 "rsmresource_lookup done\n"));
1361 return (p);
1364 static void
1365 rsmresource_insert(minor_t rnum, rsmresource_t *p, rsm_resource_type_t type)
1367 /* Find resource and lock it in READER mode */
1368 /* Caller can upgrade if need be */
1369 /* search for available resource slot */
1370 int i, j;
1371 rsmresource_blk_t *blk;
1373 DBG_PRINTF((RSM_KERNEL_ALL, RSM_DEBUG_VERBOSE,
1374 "rsmresource_insert enter\n"));
1376 i = (int)(rnum / RSMRC_BLKSZ);
1377 j = (int)(rnum % RSMRC_BLKSZ);
1379 p->rsmrc_type = type;
1380 p->rsmrc_num = rnum;
1382 rw_enter(&rsm_resource.rsmrc_lock, RW_READER);
1384 ASSERT(rsm_resource.rsmrc_root);
1385 ASSERT(i < rsm_resource.rsmrc_len);
1386 ASSERT(i < rsm_resource.rsmrc_sz);
1388 blk = rsm_resource.rsmrc_root[i];
1389 ASSERT(blk);
1391 ASSERT(blk->rsmrcblk_blks[j] == RSMRC_RESERVED);
1393 blk->rsmrcblk_blks[j] = p;
1395 DBG_PRINTF((RSM_KERNEL_ALL, RSM_DEBUG_VERBOSE,
1396 "rsmresource_insert done\n"));
1398 rw_exit(&rsm_resource.rsmrc_lock);
1401 static void
1402 rsmresource_destroy()
1404 int i, j;
1406 DBG_PRINTF((RSM_KERNEL_ALL, RSM_DEBUG_VERBOSE,
1407 "rsmresource_destroy enter\n"));
1409 rw_enter(&rsm_resource.rsmrc_lock, RW_WRITER);
1411 for (i = 0; i < rsm_resource.rsmrc_len; i++) {
1412 rsmresource_blk_t *blk;
1414 blk = rsm_resource.rsmrc_root[i];
1415 if (blk == NULL) {
1416 continue;
1418 for (j = 0; j < RSMRC_BLKSZ; j++) {
1419 if (blk->rsmrcblk_blks[j] != NULL) {
1420 DBG_PRINTF((RSM_KERNEL_ALL, RSM_DEBUG_VERBOSE,
1421 "Not null slot %d, %lx\n", j,
1422 (size_t)blk->rsmrcblk_blks[j]));
1425 kmem_free(blk, sizeof (*blk));
1426 rsm_resource.rsmrc_root[i] = NULL;
1428 if (rsm_resource.rsmrc_root) {
1429 i = rsm_resource.rsmrc_sz * (int)sizeof (rsmresource_blk_t *);
1430 kmem_free(rsm_resource.rsmrc_root, (uint_t)i);
1431 rsm_resource.rsmrc_root = NULL;
1432 rsm_resource.rsmrc_len = 0;
1433 rsm_resource.rsmrc_sz = 0;
1436 DBG_PRINTF((RSM_KERNEL_ALL, RSM_DEBUG_VERBOSE,
1437 "rsmresource_destroy done\n"));
1439 rw_exit(&rsm_resource.rsmrc_lock);
1443 /* ******************** Generic Key Hash Table Management ********* */
1444 static rsmresource_t *
1445 rsmhash_lookup(rsmhash_table_t *rhash, rsm_memseg_id_t key,
1446 rsm_resource_state_t state)
1448 rsmresource_t *p;
1449 uint_t hashval;
1450 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL);
1452 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmhash_lookup enter\n"));
1454 hashval = rsmhash(key);
1456 DBG_PRINTF((category, RSM_DEBUG_LVL2, "rsmhash_lookup %u=%d\n",
1457 key, hashval));
1459 rw_enter(&rhash->rsmhash_rw, RW_READER);
1461 p = (rsmresource_t *)rsmhash_getbkt(rhash, hashval);
1463 for (; p; p = p->rsmrc_next) {
1464 if (p->rsmrc_key == key) {
1465 /* acquire resource lock */
1466 RSMRC_LOCK(p);
1467 break;
1471 rw_exit(&rhash->rsmhash_rw);
1473 if (p != NULL && p->rsmrc_state != state) {
1474 /* state changed, release lock and return null */
1475 RSMRC_UNLOCK(p);
1476 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
1477 "rsmhash_lookup done: state changed\n"));
1478 return (NULL);
1481 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmhash_lookup done\n"));
1483 return (p);
1486 static void
1487 rsmhash_rm(rsmhash_table_t *rhash, rsmresource_t *rcelm)
1489 rsmresource_t *p, **back;
1490 uint_t hashval;
1491 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL);
1493 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmhash_rm enter\n"));
1495 hashval = rsmhash(rcelm->rsmrc_key);
1497 DBG_PRINTF((category, RSM_DEBUG_LVL2, "rsmhash_rm %u=%d\n",
1498 rcelm->rsmrc_key, hashval));
1501 * It's ok not to find the segment.
1503 rw_enter(&rhash->rsmhash_rw, RW_WRITER);
1505 back = (rsmresource_t **)rsmhash_bktaddr(rhash, hashval);
1507 for (; (p = *back) != NULL; back = &p->rsmrc_next) {
1508 if (p == rcelm) {
1509 *back = rcelm->rsmrc_next;
1510 break;
1514 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmhash_rm done\n"));
1516 rw_exit(&rhash->rsmhash_rw);
1519 static int
1520 rsmhash_add(rsmhash_table_t *rhash, rsmresource_t *new, rsm_memseg_id_t key,
1521 int dup_check, rsm_resource_state_t state)
1523 rsmresource_t *p = NULL, **bktp;
1524 uint_t hashval;
1525 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL);
1527 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmhash_add enter\n"));
1529 /* lock table */
1530 rw_enter(&rhash->rsmhash_rw, RW_WRITER);
1533 * If the current resource state is other than the state passed in
1534 * then the resource is (probably) already on the list. eg. for an
1535 * import segment if the state is not RSM_STATE_NEW then it's on the
1536 * list already.
1538 RSMRC_LOCK(new);
1539 if (new->rsmrc_state != state) {
1540 RSMRC_UNLOCK(new);
1541 rw_exit(&rhash->rsmhash_rw);
1542 return (RSMERR_BAD_SEG_HNDL);
1545 hashval = rsmhash(key);
1546 DBG_PRINTF((category, RSM_DEBUG_LVL2, "rsmhash_add %d\n", hashval));
1548 if (dup_check) {
1550 * Used for checking export segments; don't want to have
1551 * the same key used for multiple segments.
1554 p = (rsmresource_t *)rsmhash_getbkt(rhash, hashval);
1556 for (; p; p = p->rsmrc_next) {
1557 if (p->rsmrc_key == key) {
1558 RSMRC_UNLOCK(new);
1559 break;
1564 if (p == NULL) {
1565 /* Key doesn't exist, add it */
1567 bktp = (rsmresource_t **)rsmhash_bktaddr(rhash, hashval);
1569 new->rsmrc_key = key;
1570 new->rsmrc_next = *bktp;
1571 *bktp = new;
1574 rw_exit(&rhash->rsmhash_rw);
1576 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmhash_add done\n"));
1578 return (p == NULL ? RSM_SUCCESS : RSMERR_SEGID_IN_USE);
1582 * XOR each byte of the key.
1584 static uint_t
1585 rsmhash(rsm_memseg_id_t key)
1587 uint_t hash = key;
1589 hash ^= (key >> 8);
1590 hash ^= (key >> 16);
1591 hash ^= (key >> 24);
1593 return (hash % rsm_hash_size);
1598 * generic function to get a specific bucket
1600 static void *
1601 rsmhash_getbkt(rsmhash_table_t *rhash, uint_t hashval)
1604 if (rhash->bucket == NULL)
1605 return (NULL);
1606 else
1607 return ((void *)rhash->bucket[hashval]);
1611 * generic function to get a specific bucket's address
1613 static void **
1614 rsmhash_bktaddr(rsmhash_table_t *rhash, uint_t hashval)
1616 if (rhash->bucket == NULL)
1617 return (NULL);
1618 else
1619 return ((void **)&(rhash->bucket[hashval]));
1623 * generic function to alloc a hash table
1625 static void
1626 rsmhash_alloc(rsmhash_table_t *rhash, int size)
1628 rhash->bucket = (rsmresource_t **)
1629 kmem_zalloc(size * sizeof (rsmresource_t *), KM_SLEEP);
1633 * generic function to free a hash table
1635 static void
1636 rsmhash_free(rsmhash_table_t *rhash, int size)
1639 kmem_free((void *)rhash->bucket, size * sizeof (caddr_t));
1640 rhash->bucket = NULL;
/* *********************** Exported Segment Key Management ************ */

/*
 * Wrappers around the generic hash functions for the export segment
 * table.  Adding checks for duplicate keys and expects the segment in
 * RSM_STATE_BIND; lookup only returns segments in RSM_STATE_EXPORT.
 * Every macro argument and each cast expansion is parenthesized so that
 * arbitrary expressions can be passed and the result can be used in
 * any expression context.
 */
#define	rsmexport_add(new, key)		\
	rsmhash_add(&rsm_export_segs, (rsmresource_t *)(new), (key), 1, \
	    RSM_STATE_BIND)

#define	rsmexport_rm(arg)	\
	rsmhash_rm(&rsm_export_segs, (rsmresource_t *)(arg))

#define	rsmexport_lookup(key)	\
	((rsmseg_t *)rsmhash_lookup(&rsm_export_segs, (key), \
	    RSM_STATE_EXPORT))

/* ************************** Import Segment List Management ********** */

/*
 * Add segment to import list. This will be useful for paging and loopback
 * segment unloading.  No duplicate check is made and the segment is
 * expected in RSM_STATE_NEW.
 */
#define	rsmimport_add(arg, key)	\
	rsmhash_add(&rsm_import_segs, (rsmresource_t *)(arg), (key), 0, \
	    RSM_STATE_NEW)

#define	rsmimport_rm(arg)	\
	rsmhash_rm(&rsm_import_segs, (rsmresource_t *)(arg))

/*
 *	#define	rsmimport_lookup(key)	\
 *	(rsmseg_t *)rsmhash_lookup(&rsm_import_segs, (key), RSM_STATE_CONNECT)
 */
1674 * increase the ref count and make the import segment point to the
1675 * shared data structure. Return a pointer to the share data struct
1676 * and the shared data struct is locked upon return
1678 static rsm_import_share_t *
1679 rsmshare_get(rsm_memseg_id_t key, rsm_node_id_t node, adapter_t *adapter,
1680 rsmseg_t *segp)
1682 uint_t hash;
1683 rsmresource_t *p;
1684 rsm_import_share_t *shdatap;
1685 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL);
1687 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmshare_get enter\n"));
1689 hash = rsmhash(key);
1690 /* lock table */
1691 rw_enter(&rsm_import_segs.rsmhash_rw, RW_WRITER);
1692 DBG_PRINTF((category, RSM_DEBUG_LVL2, "rsmshare_get:key=%u, hash=%d\n",
1693 key, hash));
1695 p = (rsmresource_t *)rsmhash_getbkt(&rsm_import_segs, hash);
1697 for (; p; p = p->rsmrc_next) {
1699 * Look for an entry that is importing the same exporter
1700 * with the share data structure allocated.
1702 if ((p->rsmrc_key == key) &&
1703 (p->rsmrc_node == node) &&
1704 (p->rsmrc_adapter == adapter) &&
1705 (((rsmseg_t *)p)->s_share != NULL)) {
1706 shdatap = ((rsmseg_t *)p)->s_share;
1707 break;
1711 if (p == NULL) {
1712 /* we are the first importer, create the shared data struct */
1713 shdatap = kmem_zalloc(sizeof (rsm_import_share_t), KM_SLEEP);
1714 shdatap->rsmsi_state = RSMSI_STATE_NEW;
1715 shdatap->rsmsi_segid = key;
1716 shdatap->rsmsi_node = node;
1717 mutex_init(&shdatap->rsmsi_lock, NULL, MUTEX_DRIVER, NULL);
1718 cv_init(&shdatap->rsmsi_cv, NULL, CV_DRIVER, 0);
1721 rsmseglock_acquire(segp);
1723 /* we grab the shared lock before returning from this function */
1724 mutex_enter(&shdatap->rsmsi_lock);
1726 shdatap->rsmsi_refcnt++;
1727 segp->s_share = shdatap;
1729 rsmseglock_release(segp);
1731 rw_exit(&rsm_import_segs.rsmhash_rw);
1733 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmshare_get done\n"));
1735 return (shdatap);
1739 * the shared data structure should be locked before calling
1740 * rsmsharecv_signal().
1741 * Change the state and signal any waiting segments.
1743 void
1744 rsmsharecv_signal(rsmseg_t *seg, int oldstate, int newstate)
1746 ASSERT(rsmsharelock_held(seg));
1748 if (seg->s_share->rsmsi_state == oldstate) {
1749 seg->s_share->rsmsi_state = newstate;
1750 cv_broadcast(&seg->s_share->rsmsi_cv);
1755 * Add to the hash table
1757 static void
1758 importer_list_add(rsm_node_id_t node, rsm_memseg_id_t key, rsm_addr_t hwaddr,
1759 void *cookie)
1762 importing_token_t *head;
1763 importing_token_t *new_token;
1764 int index;
1766 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL);
1768 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "importer_list_add enter\n"));
1770 new_token = kmem_zalloc(sizeof (importing_token_t), KM_SLEEP);
1771 new_token->importing_node = node;
1772 new_token->key = key;
1773 new_token->import_segment_cookie = cookie;
1774 new_token->importing_adapter_hwaddr = hwaddr;
1776 index = rsmhash(key);
1778 mutex_enter(&importer_list.lock);
1780 head = importer_list.bucket[index];
1781 importer_list.bucket[index] = new_token;
1782 new_token->next = head;
1783 mutex_exit(&importer_list.lock);
1785 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "importer_list_add done\n"));
1788 static void
1789 importer_list_rm(rsm_node_id_t node, rsm_memseg_id_t key, void *cookie)
1792 importing_token_t *prev, *token = NULL;
1793 int index;
1794 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL);
1796 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "importer_list_rm enter\n"));
1798 index = rsmhash(key);
1800 mutex_enter(&importer_list.lock);
1802 token = importer_list.bucket[index];
1804 prev = token;
1805 while (token != NULL) {
1806 if (token->importing_node == node &&
1807 token->import_segment_cookie == cookie) {
1808 if (prev == token)
1809 importer_list.bucket[index] = token->next;
1810 else
1811 prev->next = token->next;
1812 kmem_free((void *)token, sizeof (*token));
1813 break;
1814 } else {
1815 prev = token;
1816 token = token->next;
1820 mutex_exit(&importer_list.lock);
1822 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "importer_list_rm done\n"));
1827 /* **************************Segment Structure Management ************* */
1830 * Free segment structure
1832 static void
1833 rsmseg_free(rsmseg_t *seg)
1836 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL);
1838 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmseg_free enter\n"));
1840 /* need to take seglock here to avoid race with rsmmap_unmap() */
1841 rsmseglock_acquire(seg);
1842 if (seg->s_ckl != NULL) {
1843 /* Segment is still busy */
1844 seg->s_state = RSM_STATE_END;
1845 rsmseglock_release(seg);
1846 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
1847 "rsmseg_free done\n"));
1848 return;
1851 rsmseglock_release(seg);
1853 ASSERT(seg->s_state == RSM_STATE_END || seg->s_state == RSM_STATE_NEW);
1856 * If it's an importer decrement the refcount
1857 * and if its down to zero free the shared data structure.
1858 * This is where failures during rsm_connect() are unrefcounted
1860 if (seg->s_share != NULL) {
1862 ASSERT(seg->s_type == RSM_RESOURCE_IMPORT_SEGMENT);
1864 rsmsharelock_acquire(seg);
1866 ASSERT(seg->s_share->rsmsi_refcnt > 0);
1868 seg->s_share->rsmsi_refcnt--;
1870 if (seg->s_share->rsmsi_refcnt == 0) {
1871 rsmsharelock_release(seg);
1872 mutex_destroy(&seg->s_share->rsmsi_lock);
1873 cv_destroy(&seg->s_share->rsmsi_cv);
1874 kmem_free((void *)(seg->s_share),
1875 sizeof (rsm_import_share_t));
1876 } else {
1877 rsmsharelock_release(seg);
1880 * The following needs to be done after any
1881 * rsmsharelock calls which use seg->s_share.
1883 seg->s_share = NULL;
1886 cv_destroy(&seg->s_cv);
1887 mutex_destroy(&seg->s_lock);
1888 rsmacl_free(seg->s_acl, seg->s_acl_len);
1889 rsmpiacl_free(seg->s_acl_in, seg->s_acl_len);
1890 if (seg->s_adapter)
1891 rsmka_release_adapter(seg->s_adapter);
1893 kmem_free((void *)seg, sizeof (*seg));
1895 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmseg_free done\n"));
1900 static rsmseg_t *
1901 rsmseg_alloc(minor_t num, struct cred *cred)
1903 rsmseg_t *new;
1904 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL);
1906 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmseg_alloc enter\n"));
1908 * allocate memory for new segment. This should be a segkmem cache.
1910 new = (rsmseg_t *)kmem_zalloc(sizeof (*new), KM_SLEEP);
1912 new->s_state = RSM_STATE_NEW;
1913 new->s_minor = num;
1914 new->s_acl_len = 0;
1915 new->s_cookie = NULL;
1916 new->s_adapter = NULL;
1918 new->s_mode = 0777 & ~PTOU((ttoproc(curthread)))->u_cmask;
1919 /* we don't have a key yet, will set at export/connect */
1920 new->s_uid = crgetuid(cred);
1921 new->s_gid = crgetgid(cred);
1923 mutex_init(&new->s_lock, NULL, MUTEX_DRIVER, NULL);
1924 cv_init(&new->s_cv, NULL, CV_DRIVER, 0);
1926 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmseg_alloc done\n"));
1928 return (new);
1931 /* ******************************** Driver Open/Close/Poll *************** */
1933 /*ARGSUSED1*/
1934 static int
1935 rsm_open(dev_t *devp, int flag, int otyp, struct cred *cred)
1937 minor_t rnum;
1938 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL| RSM_DDI);
1940 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_open enter\n"));
1942 * Char only
1944 if (otyp != OTYP_CHR) {
1945 DBG_PRINTF((category, RSM_ERR, "rsm_open: bad otyp\n"));
1946 return (EINVAL);
1950 * Only zero can be opened, clones are used for resources.
1952 if (getminor(*devp) != RSM_DRIVER_MINOR) {
1953 DBG_PRINTF((category, RSM_ERR,
1954 "rsm_open: bad minor %d\n", getminor(*devp)));
1955 return (ENODEV);
1958 if ((flag & FEXCL) != 0 && secpolicy_excl_open(cred) != 0) {
1959 DBG_PRINTF((category, RSM_ERR, "rsm_open: bad perm\n"));
1960 return (EPERM);
1963 if (!(flag & FWRITE)) {
1965 * The library function _rsm_librsm_init calls open for
1966 * /dev/rsm with flag set to O_RDONLY. We want a valid
1967 * file descriptor to be returned for minor device zero.
1970 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
1971 "rsm_open RDONLY done\n"));
1972 return (DDI_SUCCESS);
1976 * - allocate new minor number and segment.
1977 * - add segment to list of all segments.
1978 * - set minordev data to segment
1979 * - update devp argument to new device
1980 * - update s_cred to cred; make sure you do crhold(cred);
1983 /* allocate a new resource number */
1984 if (rsmresource_alloc(&rnum) == RSM_SUCCESS) {
1986 * We will bind this minor to a specific resource in first
1987 * ioctl
1989 *devp = makedevice(getmajor(*devp), rnum);
1990 } else {
1991 return (EAGAIN);
1994 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_open done\n"));
1995 return (DDI_SUCCESS);
1998 static void
1999 rsmseg_close(rsmseg_t *seg, int force_flag)
2001 int e = RSM_SUCCESS;
2003 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL| RSM_DDI);
2005 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmseg_close enter\n"));
2007 rsmseglock_acquire(seg);
2008 if (!force_flag && (seg->s_hdr.rsmrc_type ==
2009 RSM_RESOURCE_EXPORT_SEGMENT)) {
2011 * If we are processing rsm_close wait for force_destroy
2012 * processing to complete since force_destroy processing
2013 * needs to finish first before we can free the segment.
2014 * force_destroy is only for export segments
2016 while (seg->s_flags & RSM_FORCE_DESTROY_WAIT) {
2017 cv_wait(&seg->s_cv, &seg->s_lock);
2020 rsmseglock_release(seg);
2022 /* It's ok to read the state without a lock */
2023 switch (seg->s_state) {
2024 case RSM_STATE_EXPORT:
2025 case RSM_STATE_EXPORT_QUIESCING:
2026 case RSM_STATE_EXPORT_QUIESCED:
2027 e = rsm_unpublish(seg, 1);
2028 /* FALLTHRU */
2029 case RSM_STATE_BIND_QUIESCED:
2030 /* FALLTHRU */
2031 case RSM_STATE_BIND:
2032 e = rsm_unbind(seg);
2033 if (e != RSM_SUCCESS && force_flag == 1)
2034 return;
2035 ASSERT(seg->s_hdr.rsmrc_type == RSM_RESOURCE_EXPORT_SEGMENT);
2036 /* FALLTHRU */
2037 case RSM_STATE_NEW_QUIESCED:
2038 rsmseglock_acquire(seg);
2039 seg->s_state = RSM_STATE_NEW;
2040 cv_broadcast(&seg->s_cv);
2041 rsmseglock_release(seg);
2042 break;
2043 case RSM_STATE_NEW:
2044 break;
2045 case RSM_STATE_ZOMBIE:
2047 * Segments in this state have been removed off the
2048 * exported segments list and have been unpublished
2049 * and unbind. These segments have been removed during
2050 * a callback to the rsm_export_force_destroy, which
2051 * is called for the purpose of unlocking these
2052 * exported memory segments when a process exits but
2053 * leaves the segments locked down since rsm_close is
2054 * is not called for the segments. This can happen
2055 * when a process calls fork or exec and then exits.
2056 * Once the segments are in the ZOMBIE state, all that
2057 * remains is to destroy them when rsm_close is called.
2058 * This is done here. Thus, for such segments the
2059 * the state is changed to new so that later in this
2060 * function rsmseg_free is called.
2062 rsmseglock_acquire(seg);
2063 seg->s_state = RSM_STATE_NEW;
2064 rsmseglock_release(seg);
2065 break;
2066 case RSM_STATE_MAP_QUIESCE:
2067 case RSM_STATE_ACTIVE:
2068 /* Disconnect will handle the unmap */
2069 case RSM_STATE_CONN_QUIESCE:
2070 case RSM_STATE_CONNECT:
2071 case RSM_STATE_DISCONNECT:
2072 ASSERT(seg->s_hdr.rsmrc_type == RSM_RESOURCE_IMPORT_SEGMENT);
2073 (void) rsm_disconnect(seg);
2074 break;
2075 case RSM_STATE_MAPPING:
2076 /*FALLTHRU*/
2077 case RSM_STATE_END:
2078 DBG_PRINTF((category, RSM_ERR,
2079 "Invalid segment state %d in rsm_close\n", seg->s_state));
2080 break;
2081 default:
2082 DBG_PRINTF((category, RSM_ERR,
2083 "Invalid segment state %d in rsm_close\n", seg->s_state));
2084 break;
2088 * check state.
2089 * - make sure you do crfree(s_cred);
2090 * release segment and minor number
2092 ASSERT(seg->s_state == RSM_STATE_NEW);
2095 * The export_force_destroy callback is created to unlock
2096 * the exported segments of a process
2097 * when the process does a fork or exec and then exits calls this
2098 * function with the force flag set to 1 which indicates that the
2099 * segment state must be converted to ZOMBIE. This state means that the
2100 * segments still exist and have been unlocked and most importantly the
2101 * only operation allowed is to destroy them on an rsm_close.
2103 if (force_flag) {
2104 rsmseglock_acquire(seg);
2105 seg->s_state = RSM_STATE_ZOMBIE;
2106 rsmseglock_release(seg);
2107 } else {
2108 rsmseg_free(seg);
2111 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmseg_close done\n"));
2114 static int
2115 rsm_close(dev_t dev, int flag, int otyp, cred_t *cred)
2117 minor_t rnum = getminor(dev);
2118 rsmresource_t *res;
2119 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL| RSM_DDI);
2121 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_close enter\n"));
2123 flag = flag; cred = cred;
2125 if (otyp != OTYP_CHR)
2126 return (EINVAL);
2128 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rnum = %d\n", rnum));
2131 * At this point we are the last reference to the resource.
2132 * Free resource number from resource table.
2133 * It's ok to remove number before we free the segment.
2134 * We need to lock the resource to protect against remote calls.
2136 if (rnum == RSM_DRIVER_MINOR ||
2137 (res = rsmresource_free(rnum)) == NULL) {
2138 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_close done\n"));
2139 return (DDI_SUCCESS);
2142 switch (res->rsmrc_type) {
2143 case RSM_RESOURCE_EXPORT_SEGMENT:
2144 case RSM_RESOURCE_IMPORT_SEGMENT:
2145 rsmseg_close((rsmseg_t *)res, 0);
2146 break;
2147 case RSM_RESOURCE_BAR:
2148 DBG_PRINTF((category, RSM_ERR, "bad resource in rsm_close\n"));
2149 break;
2150 default:
2151 break;
2154 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_close done\n"));
2156 return (DDI_SUCCESS);
2160 * rsm_inc_pgcnt
2162 * Description: increment rsm page counter.
2164 * Parameters: pgcnt_t pnum; number of pages to be used
2166 * Returns: RSM_SUCCESS if memory limit not exceeded
2167 * ENOSPC if memory limit exceeded. In this case, the
2168 * page counter remains unchanged.
2171 static int
2172 rsm_inc_pgcnt(pgcnt_t pnum)
2174 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_EXPORT);
2175 if (rsm_pgcnt_max == 0) { /* no upper limit has been set */
2176 return (RSM_SUCCESS);
2179 mutex_enter(&rsm_pgcnt_lock);
2181 if (rsm_pgcnt + pnum > rsm_pgcnt_max) {
2182 /* ensure that limits have not been exceeded */
2183 mutex_exit(&rsm_pgcnt_lock);
2184 return (RSMERR_INSUFFICIENT_MEM);
2187 rsm_pgcnt += pnum;
2188 DBG_PRINTF((category, RSM_DEBUG, "rsm_pgcnt incr to %d.\n",
2189 rsm_pgcnt));
2190 mutex_exit(&rsm_pgcnt_lock);
2192 return (RSM_SUCCESS);
2196 * rsm_dec_pgcnt
2198 * Description: decrement rsm page counter.
2200 * Parameters: pgcnt_t pnum; number of pages freed
2203 static void
2204 rsm_dec_pgcnt(pgcnt_t pnum)
2206 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_EXPORT);
2208 if (rsm_pgcnt_max == 0) { /* no upper limit has been set */
2209 return;
2212 mutex_enter(&rsm_pgcnt_lock);
2213 ASSERT(rsm_pgcnt >= pnum);
2214 rsm_pgcnt -= pnum;
2215 DBG_PRINTF((category, RSM_DEBUG, "rsm_pgcnt decr to %d.\n",
2216 rsm_pgcnt));
2217 mutex_exit(&rsm_pgcnt_lock);
2220 static struct umem_callback_ops rsm_as_ops = {
2221 UMEM_CALLBACK_VERSION, /* version number */
2222 rsm_export_force_destroy,
2225 static int
2226 rsm_bind_pages(ddi_umem_cookie_t *cookie, caddr_t vaddr, size_t len,
2227 proc_t *procp)
2229 int error = RSM_SUCCESS;
2230 ulong_t pnum;
2231 struct umem_callback_ops *callbackops = &rsm_as_ops;
2233 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_EXPORT);
2235 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_bind_pages enter\n"));
2238 * Make sure vaddr and len are aligned on a page boundary
2240 if ((uintptr_t)vaddr & (PAGESIZE - 1)) {
2241 return (RSMERR_BAD_ADDR);
2244 if (len & (PAGESIZE - 1)) {
2245 return (RSMERR_BAD_LENGTH);
2249 * Find number of pages
2251 pnum = btopr(len);
2252 error = rsm_inc_pgcnt(pnum);
2253 if (error != RSM_SUCCESS) {
2254 DBG_PRINTF((category, RSM_ERR,
2255 "rsm_bind_pages:mem limit exceeded\n"));
2256 return (RSMERR_INSUFFICIENT_MEM);
2259 error = umem_lockmemory(vaddr, len,
2260 DDI_UMEMLOCK_WRITE|DDI_UMEMLOCK_READ|DDI_UMEMLOCK_LONGTERM,
2261 cookie,
2262 callbackops, procp);
2264 if (error) {
2265 rsm_dec_pgcnt(pnum);
2266 DBG_PRINTF((category, RSM_ERR,
2267 "rsm_bind_pages:ddi_umem_lock failed\n"));
2269 * ddi_umem_lock, in the case of failure, returns one of
2270 * the following three errors. These are translated into
2271 * the RSMERR namespace and returned.
2273 if (error == EFAULT)
2274 return (RSMERR_BAD_ADDR);
2275 else if (error == EACCES)
2276 return (RSMERR_PERM_DENIED);
2277 else
2278 return (RSMERR_INSUFFICIENT_MEM);
2281 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_bind_pages done\n"));
2283 return (error);
2287 static int
2288 rsm_unbind_pages(rsmseg_t *seg)
2290 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_EXPORT);
2292 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_unbind_pages enter\n"));
2294 ASSERT(rsmseglock_held(seg));
2296 if (seg->s_cookie != NULL) {
2297 /* unlock address range */
2298 ddi_umem_unlock(seg->s_cookie);
2299 rsm_dec_pgcnt(btopr(seg->s_len));
2300 seg->s_cookie = NULL;
2303 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_unbind_pages done\n"));
2305 return (RSM_SUCCESS);
2309 static int
2310 rsm_bind(rsmseg_t *seg, rsm_ioctlmsg_t *msg, intptr_t dataptr, int mode)
2312 int e;
2313 adapter_t *adapter;
2314 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_EXPORT);
2316 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_bind enter\n"));
2318 adapter = rsm_getadapter(msg, mode);
2319 if (adapter == NULL) {
2320 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
2321 "rsm_bind done:no adapter\n"));
2322 return (RSMERR_CTLR_NOT_PRESENT);
2325 /* lock address range */
2326 if (msg->vaddr == NULL) {
2327 rsmka_release_adapter(adapter);
2328 DBG_PRINTF((category, RSM_ERR,
2329 "rsm: rsm_bind done: invalid vaddr\n"));
2330 return (RSMERR_BAD_ADDR);
2332 if (msg->len <= 0) {
2333 rsmka_release_adapter(adapter);
2334 DBG_PRINTF((category, RSM_ERR,
2335 "rsm_bind: invalid length\n"));
2336 return (RSMERR_BAD_LENGTH);
2339 /* Lock segment */
2340 rsmseglock_acquire(seg);
2342 while (seg->s_state == RSM_STATE_NEW_QUIESCED) {
2343 if (cv_wait_sig(&seg->s_cv, &seg->s_lock) == 0) {
2344 DBG_PRINTF((category, RSM_DEBUG,
2345 "rsm_bind done: cv_wait INTERRUPTED"));
2346 rsmka_release_adapter(adapter);
2347 rsmseglock_release(seg);
2348 return (RSMERR_INTERRUPTED);
2352 ASSERT(seg->s_state == RSM_STATE_NEW);
2354 ASSERT(seg->s_cookie == NULL);
2356 e = rsm_bind_pages(&seg->s_cookie, msg->vaddr, msg->len, curproc);
2357 if (e == RSM_SUCCESS) {
2358 seg->s_flags |= RSM_USER_MEMORY;
2359 if (msg->perm & RSM_ALLOW_REBIND) {
2360 seg->s_flags |= RSMKA_ALLOW_UNBIND_REBIND;
2362 if (msg->perm & RSM_CREATE_SEG_DONTWAIT) {
2363 seg->s_flags |= RSMKA_SET_RESOURCE_DONTWAIT;
2365 seg->s_region.r_vaddr = msg->vaddr;
2367 * Set the s_pid value in the segment structure. This is used
2368 * to identify exported segments belonging to a particular
2369 * process so that when the process exits, these segments can
2370 * be unlocked forcefully even if rsm_close is not called on
2371 * process exit since there maybe other processes referencing
2372 * them (for example on a fork or exec).
2373 * The s_pid value is also used to authenticate the process
2374 * doing a publish or unpublish on the export segment. Only
2375 * the creator of the export segment has a right to do a
2376 * publish or unpublish and unbind on the segment.
2378 seg->s_pid = ddi_get_pid();
2379 seg->s_len = msg->len;
2380 seg->s_state = RSM_STATE_BIND;
2381 seg->s_adapter = adapter;
2382 seg->s_proc = curproc;
2383 } else {
2384 rsmka_release_adapter(adapter);
2385 DBG_PRINTF((category, RSM_WARNING,
2386 "unable to lock down pages\n"));
2389 msg->rnum = seg->s_minor;
2390 /* Unlock segment */
2391 rsmseglock_release(seg);
2393 if (e == RSM_SUCCESS) {
2394 /* copyout the resource number */
2395 #ifdef _MULTI_DATAMODEL
2396 if ((mode & DATAMODEL_MASK) == DATAMODEL_ILP32) {
2397 rsm_ioctlmsg32_t msg32;
2399 msg32.rnum = msg->rnum;
2400 if (ddi_copyout((caddr_t)&msg32.rnum,
2401 (caddr_t)&((rsm_ioctlmsg32_t *)dataptr)->rnum,
2402 sizeof (minor_t), mode)) {
2403 rsmka_release_adapter(adapter);
2404 e = RSMERR_BAD_ADDR;
2407 #endif
2408 if (ddi_copyout((caddr_t)&msg->rnum,
2409 (caddr_t)&((rsm_ioctlmsg_t *)dataptr)->rnum,
2410 sizeof (minor_t), mode)) {
2411 rsmka_release_adapter(adapter);
2412 e = RSMERR_BAD_ADDR;
2416 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_bind done\n"));
2418 return (e);
2421 static void
2422 rsm_remap_local_importers(rsm_node_id_t src_nodeid,
2423 rsm_memseg_id_t ex_segid, ddi_umem_cookie_t cookie)
2425 rsmresource_t *p = NULL;
2426 rsmhash_table_t *rhash = &rsm_import_segs;
2427 uint_t index;
2429 DBG_PRINTF((RSM_KERNEL_AGENT | RSM_FUNC_ALL, RSM_DEBUG_VERBOSE,
2430 "rsm_remap_local_importers enter\n"));
2432 index = rsmhash(ex_segid);
2434 rw_enter(&rhash->rsmhash_rw, RW_READER);
2436 p = rsmhash_getbkt(rhash, index);
2438 for (; p; p = p->rsmrc_next) {
2439 rsmseg_t *seg = (rsmseg_t *)p;
2440 rsmseglock_acquire(seg);
2442 * Change the s_cookie value of only the local importers
2443 * which have been mapped (in state RSM_STATE_ACTIVE).
2444 * Note that there is no need to change the s_cookie value
2445 * if the imported segment is in RSM_STATE_MAPPING since
2446 * eventually the s_cookie will be updated via the mapping
2447 * functionality.
2449 if ((seg->s_segid == ex_segid) && (seg->s_node == src_nodeid) &&
2450 (seg->s_state == RSM_STATE_ACTIVE)) {
2451 seg->s_cookie = cookie;
2453 rsmseglock_release(seg);
2455 rw_exit(&rhash->rsmhash_rw);
2457 DBG_PRINTF((RSM_KERNEL_AGENT | RSM_FUNC_ALL, RSM_DEBUG_VERBOSE,
2458 "rsm_remap_local_importers done\n"));
2461 static int
2462 rsm_rebind(rsmseg_t *seg, rsm_ioctlmsg_t *msg)
2464 int e;
2465 adapter_t *adapter;
2466 ddi_umem_cookie_t cookie;
2467 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_EXPORT);
2469 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_rebind enter\n"));
2471 /* Check for permissions to rebind */
2472 if (!(seg->s_flags & RSMKA_ALLOW_UNBIND_REBIND)) {
2473 return (RSMERR_REBIND_NOT_ALLOWED);
2476 if (seg->s_pid != ddi_get_pid() &&
2477 ddi_get_pid() != 0) {
2478 DBG_PRINTF((category, RSM_ERR, "rsm_rebind: Not owner\n"));
2479 return (RSMERR_NOT_CREATOR);
2483 * We will not be allowing partial rebind and hence length passed
2484 * in must be same as segment length
2486 if (msg->vaddr == NULL) {
2487 DBG_PRINTF((category, RSM_ERR,
2488 "rsm_rebind done: null msg->vaddr\n"));
2489 return (RSMERR_BAD_ADDR);
2491 if (msg->len != seg->s_len) {
2492 DBG_PRINTF((category, RSM_ERR,
2493 "rsm_rebind: invalid length\n"));
2494 return (RSMERR_BAD_LENGTH);
2497 /* Lock segment */
2498 rsmseglock_acquire(seg);
2500 while ((seg->s_state == RSM_STATE_BIND_QUIESCED) ||
2501 (seg->s_state == RSM_STATE_EXPORT_QUIESCING) ||
2502 (seg->s_state == RSM_STATE_EXPORT_QUIESCED)) {
2503 if (cv_wait_sig(&seg->s_cv, &seg->s_lock) == 0) {
2504 rsmseglock_release(seg);
2505 DBG_PRINTF((category, RSM_DEBUG,
2506 "rsm_rebind done: cv_wait INTERRUPTED"));
2507 return (RSMERR_INTERRUPTED);
2511 /* verify segment state */
2512 if ((seg->s_state != RSM_STATE_BIND) &&
2513 (seg->s_state != RSM_STATE_EXPORT)) {
2514 /* Unlock segment */
2515 rsmseglock_release(seg);
2516 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
2517 "rsm_rebind done: invalid state\n"));
2518 return (RSMERR_BAD_SEG_HNDL);
2521 ASSERT(seg->s_cookie != NULL);
2523 if (msg->vaddr == seg->s_region.r_vaddr) {
2524 rsmseglock_release(seg);
2525 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_rebind done\n"));
2526 return (RSM_SUCCESS);
2529 e = rsm_bind_pages(&cookie, msg->vaddr, msg->len, curproc);
2530 if (e == RSM_SUCCESS) {
2531 struct buf *xbuf;
2532 dev_t sdev = 0;
2533 rsm_memory_local_t mem;
2535 xbuf = ddi_umem_iosetup(cookie, 0, msg->len, B_WRITE,
2536 sdev, 0, NULL, DDI_UMEM_SLEEP);
2537 ASSERT(xbuf != NULL);
2539 mem.ms_type = RSM_MEM_BUF;
2540 mem.ms_bp = xbuf;
2542 adapter = seg->s_adapter;
2543 e = adapter->rsmpi_ops->rsm_rebind(
2544 seg->s_handle.out, 0, &mem,
2545 RSM_RESOURCE_DONTWAIT, NULL);
2547 if (e == RSM_SUCCESS) {
2549 * unbind the older pages, and unload local importers;
2550 * but don't disconnect importers
2552 (void) rsm_unbind_pages(seg);
2553 seg->s_cookie = cookie;
2554 seg->s_region.r_vaddr = msg->vaddr;
2555 rsm_remap_local_importers(my_nodeid, seg->s_segid,
2556 cookie);
2557 } else {
2559 * Unbind the pages associated with "cookie" by the
2560 * rsm_bind_pages calls prior to this. This is
2561 * similar to what is done in the rsm_unbind_pages
2562 * routine for the seg->s_cookie.
2564 ddi_umem_unlock(cookie);
2565 rsm_dec_pgcnt(btopr(msg->len));
2566 DBG_PRINTF((category, RSM_ERR,
2567 "rsm_rebind failed with %d\n", e));
2570 * At present there is no dependency on the existence of xbuf.
2571 * So we can free it here. If in the future this changes, it can
2572 * be freed sometime during the segment destroy.
2574 freerbuf(xbuf);
2577 /* Unlock segment */
2578 rsmseglock_release(seg);
2580 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_rebind done\n"));
2582 return (e);
2585 static int
2586 rsm_unbind(rsmseg_t *seg)
2588 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_EXPORT);
2590 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_unbind enter\n"));
2592 rsmseglock_acquire(seg);
2594 /* verify segment state */
2595 if ((seg->s_state != RSM_STATE_BIND) &&
2596 (seg->s_state != RSM_STATE_BIND_QUIESCED)) {
2597 rsmseglock_release(seg);
2598 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
2599 "rsm_unbind: invalid state\n"));
2600 return (RSMERR_BAD_SEG_HNDL);
2603 /* unlock current range */
2604 (void) rsm_unbind_pages(seg);
2606 if (seg->s_state == RSM_STATE_BIND) {
2607 seg->s_state = RSM_STATE_NEW;
2608 } else if (seg->s_state == RSM_STATE_BIND_QUIESCED) {
2609 seg->s_state = RSM_STATE_NEW_QUIESCED;
2612 rsmseglock_release(seg);
2614 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_unbind done\n"));
2616 return (RSM_SUCCESS);
2619 /* **************************** Exporter Access List Management ******* */
2620 static void
2621 rsmacl_free(rsmapi_access_entry_t *acl, int acl_len)
2623 int acl_sz;
2624 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_EXPORT);
2626 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmacl_free enter\n"));
2628 /* acl could be NULL */
2630 if (acl != NULL && acl_len > 0) {
2631 acl_sz = acl_len * sizeof (rsmapi_access_entry_t);
2632 kmem_free((void *)acl, acl_sz);
2635 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmacl_free done\n"));
2638 static void
2639 rsmpiacl_free(rsm_access_entry_t *acl, int acl_len)
2641 int acl_sz;
2642 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_EXPORT);
2644 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmpiacl_free enter\n"));
2646 if (acl != NULL && acl_len > 0) {
2647 acl_sz = acl_len * sizeof (rsm_access_entry_t);
2648 kmem_free((void *)acl, acl_sz);
2651 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmpiacl_free done\n"));
2655 static int
2656 rsmacl_build(rsm_ioctlmsg_t *msg, int mode,
2657 rsmapi_access_entry_t **list, int *len, int loopback)
2659 rsmapi_access_entry_t *acl;
2660 int acl_len;
2661 int i;
2662 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_EXPORT);
2664 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmacl_build enter\n"));
2666 *len = 0;
2667 *list = NULL;
2669 acl_len = msg->acl_len;
2670 if ((loopback && acl_len > 1) || (acl_len < 0) ||
2671 (acl_len > MAX_NODES)) {
2672 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
2673 "rsmacl_build done: acl invalid\n"));
2674 return (RSMERR_BAD_ACL);
2677 if (acl_len > 0 && acl_len <= MAX_NODES) {
2678 size_t acl_size = acl_len * sizeof (rsmapi_access_entry_t);
2680 acl = kmem_alloc(acl_size, KM_SLEEP);
2682 if (ddi_copyin((caddr_t)msg->acl, (caddr_t)acl,
2683 acl_size, mode)) {
2684 kmem_free((void *) acl, acl_size);
2685 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
2686 "rsmacl_build done: BAD_ADDR\n"));
2687 return (RSMERR_BAD_ADDR);
2691 * Verify access list
2693 for (i = 0; i < acl_len; i++) {
2694 if (acl[i].ae_node > MAX_NODES ||
2695 (loopback && (acl[i].ae_node != my_nodeid)) ||
2696 acl[i].ae_permission > RSM_ACCESS_TRUSTED) {
2697 /* invalid entry */
2698 kmem_free((void *) acl, acl_size);
2699 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
2700 "rsmacl_build done: EINVAL\n"));
2701 return (RSMERR_BAD_ACL);
2705 *len = acl_len;
2706 *list = acl;
2709 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmacl_build done\n"));
2711 return (DDI_SUCCESS);
2714 static int
2715 rsmpiacl_create(rsmapi_access_entry_t *src, rsm_access_entry_t **dest,
2716 int acl_len, adapter_t *adapter)
2718 rsm_access_entry_t *acl;
2719 rsm_addr_t hwaddr;
2720 int i;
2721 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_EXPORT);
2723 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmpiacl_create enter\n"));
2725 if (src != NULL) {
2726 size_t acl_size = acl_len * sizeof (rsm_access_entry_t);
2727 acl = kmem_alloc(acl_size, KM_SLEEP);
2730 * translate access list
2732 for (i = 0; i < acl_len; i++) {
2733 if (src[i].ae_node == my_nodeid) {
2734 acl[i].ae_addr = adapter->hwaddr;
2735 } else {
2736 hwaddr = get_remote_hwaddr(adapter,
2737 src[i].ae_node);
2738 if ((int64_t)hwaddr < 0) {
2739 /* invalid hwaddr */
2740 kmem_free((void *) acl, acl_size);
2741 DBG_PRINTF((category,
2742 RSM_DEBUG_VERBOSE,
2743 "rsmpiacl_create done:"
2744 "EINVAL hwaddr\n"));
2745 return (RSMERR_INTERNAL_ERROR);
2747 acl[i].ae_addr = hwaddr;
2749 /* rsmpi understands only RSM_PERM_XXXX */
2750 acl[i].ae_permission =
2751 src[i].ae_permission & RSM_PERM_RDWR;
2753 *dest = acl;
2754 } else {
2755 *dest = NULL;
2758 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmpiacl_create done\n"));
2760 return (RSM_SUCCESS);
2763 static int
2764 rsmsegacl_validate(rsmipc_request_t *req, rsm_node_id_t rnode,
2765 rsmipc_reply_t *reply)
2768 int i;
2769 rsmseg_t *seg;
2770 rsm_memseg_id_t key = req->rsmipc_key;
2771 rsm_permission_t perm = req->rsmipc_perm;
2772 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_EXPORT);
2774 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
2775 "rsmsegacl_validate enter\n"));
2778 * Find segment and grab its lock. The reason why we grab the segment
2779 * lock in side the search is to avoid the race when the segment is
2780 * being deleted and we already have a pointer to it.
2782 seg = rsmexport_lookup(key);
2783 if (!seg) {
2784 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
2785 "rsmsegacl_validate done: %u ENXIO\n", key));
2786 return (RSMERR_SEG_NOT_PUBLISHED);
2789 ASSERT(rsmseglock_held(seg));
2790 ASSERT(seg->s_state == RSM_STATE_EXPORT);
2793 * We implement a 2-level protection scheme.
2794 * First, we check if local/remote host has access rights.
2795 * Second, we check if the user has access rights.
2797 * This routine only validates the rnode access_list
2799 if (seg->s_acl_len > 0) {
2801 * Check host access list
2803 ASSERT(seg->s_acl != NULL);
2804 for (i = 0; i < seg->s_acl_len; i++) {
2805 if (seg->s_acl[i].ae_node == rnode) {
2806 perm &= seg->s_acl[i].ae_permission;
2807 goto found;
2810 /* rnode is not found in the list */
2811 rsmseglock_release(seg);
2812 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
2813 "rsmsegacl_validate done: EPERM\n"));
2814 return (RSMERR_SEG_NOT_PUBLISHED_TO_NODE);
2815 } else {
2816 /* use default owner creation umask */
2817 perm &= seg->s_mode;
2820 found:
2821 /* update perm for this node */
2822 reply->rsmipc_mode = perm;
2823 reply->rsmipc_uid = seg->s_uid;
2824 reply->rsmipc_gid = seg->s_gid;
2825 reply->rsmipc_segid = seg->s_segid;
2826 reply->rsmipc_seglen = seg->s_len;
2829 * Perm of requesting node is valid; source will validate user
2831 rsmseglock_release(seg);
2834 * Add the importer to the list right away, if connect fails
2835 * the importer will ask the exporter to remove it.
2837 importer_list_add(rnode, key, req->rsmipc_adapter_hwaddr,
2838 req->rsmipc_segment_cookie);
2840 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmsegacl_validate done\n"));
2842 return (RSM_SUCCESS);
2846 /* ************************** Exporter Calls ************************* */
2848 static int
2849 rsm_publish(rsmseg_t *seg, rsm_ioctlmsg_t *msg, intptr_t dataptr, int mode)
2851 int e;
2852 int acl_len;
2853 rsmapi_access_entry_t *acl;
2854 rsm_access_entry_t *rsmpi_acl;
2855 rsm_memory_local_t mem;
2856 struct buf *xbuf;
2857 dev_t sdev = 0;
2858 adapter_t *adapter;
2859 rsm_memseg_id_t segment_id = 0;
2860 int loopback_flag = 0;
2861 int create_flags = 0;
2862 rsm_resource_callback_t callback_flag;
2863 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_EXPORT);
2865 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_publish enter\n"));
2867 if (seg->s_adapter == &loopback_adapter)
2868 loopback_flag = 1;
2870 if (seg->s_pid != ddi_get_pid() &&
2871 ddi_get_pid() != 0) {
2872 DBG_PRINTF((category, RSM_ERR,
2873 "rsm_publish: Not creator\n"));
2874 return (RSMERR_NOT_CREATOR);
2878 * Get per node access list
2880 e = rsmacl_build(msg, mode, &acl, &acl_len, loopback_flag);
2881 if (e != DDI_SUCCESS) {
2882 DBG_PRINTF((category, RSM_ERR,
2883 "rsm_publish done: rsmacl_build failed\n"));
2884 return (e);
2888 * The application provided msg->key is used for resolving a
2889 * segment id according to the following:
2890 * key = 0 Kernel Agent selects the segment id
2891 * key <= RSM_DLPI_ID_END Reserved for system usage except
2892 * RSMLIB range
2893 * key < RSM_USER_APP_ID_BASE segment id = key
2894 * key >= RSM_USER_APP_ID_BASE Reserved for KA selections
2896 * rsm_nextavail_segmentid is initialized to 0x80000000 and
2897 * overflows to zero after 0x80000000 allocations.
2898 * An algorithm is needed which allows reinitialization and provides
2899 * for reallocation after overflow. For now, ENOMEM is returned
2900 * once the overflow condition has occurred.
2902 if (msg->key == 0) {
2903 mutex_enter(&rsm_lock);
2904 segment_id = rsm_nextavail_segmentid;
2905 if (segment_id != 0) {
2906 rsm_nextavail_segmentid++;
2907 mutex_exit(&rsm_lock);
2908 } else {
2909 mutex_exit(&rsm_lock);
2910 DBG_PRINTF((category, RSM_ERR,
2911 "rsm_publish done: no more keys avlbl\n"));
2912 return (RSMERR_INSUFFICIENT_RESOURCES);
2914 } else if BETWEEN(msg->key, RSM_RSMLIB_ID_BASE, RSM_RSMLIB_ID_END)
2915 /* range reserved for internal use by base/ndi libraries */
2916 segment_id = msg->key;
2917 else if (msg->key <= RSM_DLPI_ID_END)
2918 return (RSMERR_RESERVED_SEGID);
2919 else if (msg->key <= (uint_t)RSM_USER_APP_ID_BASE -1)
2920 segment_id = msg->key;
2921 else {
2922 DBG_PRINTF((category, RSM_ERR,
2923 "rsm_publish done: invalid key %u\n", msg->key));
2924 return (RSMERR_RESERVED_SEGID);
2927 /* Add key to exportlist; The segment lock is held on success */
2928 e = rsmexport_add(seg, segment_id);
2929 if (e) {
2930 rsmacl_free(acl, acl_len);
2931 DBG_PRINTF((category, RSM_ERR,
2932 "rsm_publish done: export_add failed: %d\n", e));
2933 return (e);
2936 seg->s_segid = segment_id;
2938 if ((seg->s_state != RSM_STATE_BIND) &&
2939 (seg->s_state != RSM_STATE_BIND_QUIESCED)) {
2940 /* state changed since then, free acl and return */
2941 rsmseglock_release(seg);
2942 rsmexport_rm(seg);
2943 rsmacl_free(acl, acl_len);
2944 DBG_PRINTF((category, RSM_ERR,
2945 "rsm_publish done: segment in wrong state: %d\n",
2946 seg->s_state));
2947 return (RSMERR_BAD_SEG_HNDL);
2951 * If this is for a local memory handle and permissions are zero,
2952 * then the surrogate segment is very large and we want to skip
2953 * allocation of DVMA space.
2955 * Careful! If the user didn't use an ACL list, acl will be a NULL
2956 * pointer. Check that before dereferencing it.
2958 if (acl != (rsmapi_access_entry_t *)NULL) {
2959 if (acl[0].ae_node == my_nodeid && acl[0].ae_permission == 0)
2960 goto skipdriver;
2963 /* create segment */
2964 xbuf = ddi_umem_iosetup(seg->s_cookie, 0, seg->s_len, B_WRITE,
2965 sdev, 0, NULL, DDI_UMEM_SLEEP);
2966 ASSERT(xbuf != NULL);
2968 mem.ms_type = RSM_MEM_BUF;
2969 mem.ms_bp = xbuf;
2971 /* This call includes a bind operations */
2973 adapter = seg->s_adapter;
2975 * create a acl list with hwaddr for RSMPI publish
2977 e = rsmpiacl_create(acl, &rsmpi_acl, acl_len, adapter);
2979 if (e != RSM_SUCCESS) {
2980 rsmseglock_release(seg);
2981 rsmexport_rm(seg);
2982 rsmacl_free(acl, acl_len);
2983 freerbuf(xbuf);
2984 DBG_PRINTF((category, RSM_ERR,
2985 "rsm_publish done: rsmpiacl_create failed: %d\n", e));
2986 return (e);
2989 if (seg->s_state == RSM_STATE_BIND) {
2990 /* create segment */
2992 /* This call includes a bind operations */
2994 if (seg->s_flags & RSMKA_ALLOW_UNBIND_REBIND) {
2995 create_flags = RSM_ALLOW_UNBIND_REBIND;
2998 if (seg->s_flags & RSMKA_SET_RESOURCE_DONTWAIT) {
2999 callback_flag = RSM_RESOURCE_DONTWAIT;
3000 } else {
3001 callback_flag = RSM_RESOURCE_SLEEP;
3004 e = adapter->rsmpi_ops->rsm_seg_create(
3005 adapter->rsmpi_handle,
3006 &seg->s_handle.out, seg->s_len,
3007 create_flags, &mem,
3008 callback_flag, NULL);
3010 * At present there is no dependency on the existence of xbuf.
3011 * So we can free it here. If in the future this changes, it can
3012 * be freed sometime during the segment destroy.
3014 freerbuf(xbuf);
3016 if (e != RSM_SUCCESS) {
3017 rsmseglock_release(seg);
3018 rsmexport_rm(seg);
3019 rsmacl_free(acl, acl_len);
3020 rsmpiacl_free(rsmpi_acl, acl_len);
3021 DBG_PRINTF((category, RSM_ERR,
3022 "rsm_publish done: export_create failed: %d\n", e));
3024 * The following assertion ensures that the two errors
3025 * related to the length and its alignment do not occur
3026 * since they have been checked during export_create
3028 ASSERT(e != RSMERR_BAD_MEM_ALIGNMENT &&
3029 e != RSMERR_BAD_LENGTH);
3030 if (e == RSMERR_NOT_MEM)
3031 e = RSMERR_INSUFFICIENT_MEM;
3033 return (e);
3035 /* export segment, this should create an IMMU mapping */
3036 e = adapter->rsmpi_ops->rsm_publish(
3037 seg->s_handle.out,
3038 rsmpi_acl, acl_len,
3039 seg->s_segid,
3040 RSM_RESOURCE_DONTWAIT, NULL);
3042 if (e != RSM_SUCCESS) {
3043 adapter->rsmpi_ops->rsm_seg_destroy(seg->s_handle.out);
3044 rsmseglock_release(seg);
3045 rsmexport_rm(seg);
3046 rsmacl_free(acl, acl_len);
3047 rsmpiacl_free(rsmpi_acl, acl_len);
3048 DBG_PRINTF((category, RSM_ERR,
3049 "rsm_publish done: export_publish failed: %d\n",
3050 e));
3051 return (e);
3055 seg->s_acl_in = rsmpi_acl;
3057 skipdriver:
3058 /* defer s_acl/s_acl_len -> avoid crash in rsmseg_free */
3059 seg->s_acl_len = acl_len;
3060 seg->s_acl = acl;
3062 if (seg->s_state == RSM_STATE_BIND) {
3063 seg->s_state = RSM_STATE_EXPORT;
3064 } else if (seg->s_state == RSM_STATE_BIND_QUIESCED) {
3065 seg->s_state = RSM_STATE_EXPORT_QUIESCED;
3066 cv_broadcast(&seg->s_cv);
3069 rsmseglock_release(seg);
3072 * If the segment id was solicited, then return it in
3073 * the original incoming message.
3075 if (msg->key == 0) {
3076 msg->key = segment_id;
3077 #ifdef _MULTI_DATAMODEL
3078 if ((mode & DATAMODEL_MASK) == DATAMODEL_ILP32) {
3079 rsm_ioctlmsg32_t msg32;
3081 msg32.key = msg->key;
3082 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
3083 "rsm_publish done\n"));
3084 return (ddi_copyout((caddr_t)&msg32,
3085 (caddr_t)dataptr, sizeof (msg32), mode));
3087 #endif
3088 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
3089 "rsm_publish done\n"));
3090 return (ddi_copyout((caddr_t)msg,
3091 (caddr_t)dataptr, sizeof (*msg), mode));
3094 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_publish done\n"));
3095 return (DDI_SUCCESS);
3099 * This function modifies the access control list of an already published
3100 * segment. There is no effect on import segments which are already
3101 * connected.
3103 static int
3104 rsm_republish(rsmseg_t *seg, rsm_ioctlmsg_t *msg, int mode)
3106 rsmapi_access_entry_t *new_acl, *old_acl, *tmp_acl;
3107 rsm_access_entry_t *rsmpi_new_acl, *rsmpi_old_acl;
3108 int new_acl_len, old_acl_len, tmp_acl_len;
3109 int e, i;
3110 adapter_t *adapter;
3111 int loopback_flag = 0;
3112 rsm_memseg_id_t key;
3113 rsm_permission_t permission;
3114 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_EXPORT);
3116 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_republish enter\n"));
3118 if ((seg->s_state != RSM_STATE_EXPORT) &&
3119 (seg->s_state != RSM_STATE_EXPORT_QUIESCED) &&
3120 (seg->s_state != RSM_STATE_EXPORT_QUIESCING))
3121 return (RSMERR_SEG_NOT_PUBLISHED);
3123 if (seg->s_pid != ddi_get_pid() &&
3124 ddi_get_pid() != 0) {
3125 DBG_PRINTF((category, RSM_ERR,
3126 "rsm_republish: Not owner\n"));
3127 return (RSMERR_NOT_CREATOR);
3130 if (seg->s_adapter == &loopback_adapter)
3131 loopback_flag = 1;
3134 * Build new list first
3136 e = rsmacl_build(msg, mode, &new_acl, &new_acl_len, loopback_flag);
3137 if (e) {
3138 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
3139 "rsm_republish done: rsmacl_build failed %d", e));
3140 return (e);
3143 /* Lock segment */
3144 rsmseglock_acquire(seg);
3146 * a republish is in progress - REPUBLISH message is being
3147 * sent to the importers so wait for it to complete OR
3148 * wait till DR completes
3150 while (((seg->s_state == RSM_STATE_EXPORT) &&
3151 (seg->s_flags & RSM_REPUBLISH_WAIT)) ||
3152 (seg->s_state == RSM_STATE_EXPORT_QUIESCED) ||
3153 (seg->s_state == RSM_STATE_EXPORT_QUIESCING)) {
3154 if (cv_wait_sig(&seg->s_cv, &seg->s_lock) == 0) {
3155 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
3156 "rsm_republish done: cv_wait INTERRUPTED"));
3157 rsmseglock_release(seg);
3158 rsmacl_free(new_acl, new_acl_len);
3159 return (RSMERR_INTERRUPTED);
3163 /* recheck if state is valid */
3164 if (seg->s_state != RSM_STATE_EXPORT) {
3165 rsmseglock_release(seg);
3166 rsmacl_free(new_acl, new_acl_len);
3167 return (RSMERR_SEG_NOT_PUBLISHED);
3170 key = seg->s_key;
3171 old_acl = seg->s_acl;
3172 old_acl_len = seg->s_acl_len;
3174 seg->s_acl = new_acl;
3175 seg->s_acl_len = new_acl_len;
3178 * This call will only be meaningful if and when the interconnect
3179 * layer makes use of the access list
3181 adapter = seg->s_adapter;
3183 * create a acl list with hwaddr for RSMPI publish
3185 e = rsmpiacl_create(new_acl, &rsmpi_new_acl, new_acl_len, adapter);
3187 if (e != RSM_SUCCESS) {
3188 seg->s_acl = old_acl;
3189 seg->s_acl_len = old_acl_len;
3190 rsmseglock_release(seg);
3191 rsmacl_free(new_acl, new_acl_len);
3192 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
3193 "rsm_republish done: rsmpiacl_create failed %d", e));
3194 return (e);
3196 rsmpi_old_acl = seg->s_acl_in;
3197 seg->s_acl_in = rsmpi_new_acl;
3199 e = adapter->rsmpi_ops->rsm_republish(seg->s_handle.out,
3200 seg->s_acl_in, seg->s_acl_len,
3201 RSM_RESOURCE_DONTWAIT, NULL);
3203 if (e != RSM_SUCCESS) {
3204 seg->s_acl = old_acl;
3205 seg->s_acl_in = rsmpi_old_acl;
3206 seg->s_acl_len = old_acl_len;
3207 rsmseglock_release(seg);
3208 rsmacl_free(new_acl, new_acl_len);
3209 rsmpiacl_free(rsmpi_new_acl, new_acl_len);
3211 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
3212 "rsm_republish done: rsmpi republish failed %d\n", e));
3213 return (e);
3216 /* create a tmp copy of the new acl */
3217 tmp_acl_len = new_acl_len;
3218 if (tmp_acl_len > 0) {
3219 tmp_acl = kmem_zalloc(new_acl_len*sizeof (*tmp_acl), KM_SLEEP);
3220 for (i = 0; i < tmp_acl_len; i++) {
3221 tmp_acl[i].ae_node = new_acl[i].ae_node;
3222 tmp_acl[i].ae_permission = new_acl[i].ae_permission;
3225 * The default permission of a node which was in the old
3226 * ACL but not in the new ACL is 0 ie no access.
3228 permission = 0;
3229 } else {
3231 * NULL acl means all importers can connect and
3232 * default permission will be owner creation umask
3234 tmp_acl = NULL;
3235 permission = seg->s_mode;
3238 /* make other republishers to wait for republish to complete */
3239 seg->s_flags |= RSM_REPUBLISH_WAIT;
3241 rsmseglock_release(seg);
3243 /* send the new perms to the importing nodes */
3244 rsm_send_republish(key, tmp_acl, tmp_acl_len, permission);
3246 rsmseglock_acquire(seg);
3247 seg->s_flags &= ~RSM_REPUBLISH_WAIT;
3248 /* wake up any one waiting for republish to complete */
3249 cv_broadcast(&seg->s_cv);
3250 rsmseglock_release(seg);
3252 rsmacl_free(tmp_acl, tmp_acl_len);
3253 rsmacl_free(old_acl, old_acl_len);
3254 rsmpiacl_free(rsmpi_old_acl, old_acl_len);
3256 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_republish done\n"));
3257 return (DDI_SUCCESS);
3260 static int
3261 rsm_unpublish(rsmseg_t *seg, int mode)
3263 rsmapi_access_entry_t *acl;
3264 rsm_access_entry_t *rsmpi_acl;
3265 int acl_len;
3266 int e;
3267 adapter_t *adapter;
3268 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_EXPORT);
3270 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_unpublish enter\n"));
3272 if (seg->s_pid != ddi_get_pid() &&
3273 ddi_get_pid() != 0) {
3274 DBG_PRINTF((category, RSM_ERR,
3275 "rsm_unpublish: Not creator\n"));
3276 return (RSMERR_NOT_CREATOR);
3279 rsmseglock_acquire(seg);
3281 * wait for QUIESCING to complete here before rsmexport_rm
3282 * is called because the SUSPEND_COMPLETE mesg which changes
3283 * the seg state from EXPORT_QUIESCING to EXPORT_QUIESCED and
3284 * signals the cv_wait needs to find it in the hashtable.
3286 while ((seg->s_state == RSM_STATE_EXPORT_QUIESCING) ||
3287 ((seg->s_state == RSM_STATE_EXPORT) && (seg->s_rdmacnt > 0))) {
3288 if (cv_wait_sig(&seg->s_cv, &seg->s_lock) == 0) {
3289 rsmseglock_release(seg);
3290 DBG_PRINTF((category, RSM_ERR,
3291 "rsm_unpublish done: cv_wait INTR qscing"
3292 "getv/putv in progress"));
3293 return (RSMERR_INTERRUPTED);
3297 /* verify segment state */
3298 if ((seg->s_state != RSM_STATE_EXPORT) &&
3299 (seg->s_state != RSM_STATE_EXPORT_QUIESCED)) {
3300 rsmseglock_release(seg);
3301 DBG_PRINTF((category, RSM_ERR,
3302 "rsm_unpublish done: bad state %x\n", seg->s_state));
3303 return (RSMERR_SEG_NOT_PUBLISHED);
3306 rsmseglock_release(seg);
3308 rsmexport_rm(seg);
3310 rsm_send_importer_disconnects(seg->s_segid, my_nodeid);
3312 rsmseglock_acquire(seg);
3314 * wait for republish to complete
3316 while ((seg->s_state == RSM_STATE_EXPORT) &&
3317 (seg->s_flags & RSM_REPUBLISH_WAIT)) {
3318 if (cv_wait_sig(&seg->s_cv, &seg->s_lock) == 0) {
3319 DBG_PRINTF((category, RSM_ERR,
3320 "rsm_unpublish done: cv_wait INTR repubing"));
3321 rsmseglock_release(seg);
3322 return (RSMERR_INTERRUPTED);
3326 if ((seg->s_state != RSM_STATE_EXPORT) &&
3327 (seg->s_state != RSM_STATE_EXPORT_QUIESCED)) {
3328 DBG_PRINTF((category, RSM_ERR,
3329 "rsm_unpublish done: invalid state"));
3330 rsmseglock_release(seg);
3331 return (RSMERR_SEG_NOT_PUBLISHED);
3335 * check for putv/get surrogate segment which was not published
3336 * to the driver.
3338 * Be certain to see if there is an ACL first! If this segment was
3339 * not published with an ACL, acl will be a null pointer. Check
3340 * that before dereferencing it.
3342 acl = seg->s_acl;
3343 if (acl != (rsmapi_access_entry_t *)NULL) {
3344 if (acl[0].ae_node == my_nodeid && acl[0].ae_permission == 0)
3345 goto bypass;
3348 /* The RSMPI unpublish/destroy has been done if seg is QUIESCED */
3349 if (seg->s_state == RSM_STATE_EXPORT_QUIESCED)
3350 goto bypass;
3352 adapter = seg->s_adapter;
3353 for (;;) {
3354 if (seg->s_state != RSM_STATE_EXPORT) {
3355 rsmseglock_release(seg);
3356 DBG_PRINTF((category, RSM_ERR,
3357 "rsm_unpublish done: bad state %x\n",
3358 seg->s_state));
3359 return (RSMERR_SEG_NOT_PUBLISHED);
3362 /* unpublish from adapter */
3363 e = adapter->rsmpi_ops->rsm_unpublish(seg->s_handle.out);
3365 if (e == RSM_SUCCESS) {
3366 break;
3369 if (e == RSMERR_SEG_IN_USE && mode == 1) {
3371 * wait for unpublish to succeed, it's busy.
3373 seg->s_flags |= RSM_EXPORT_WAIT;
3375 /* wait for a max of 1 ms - this is an empirical */
3376 /* value that was found by some minimal testing */
3377 /* can be fine tuned when we have better numbers */
3378 /* A long term fix would be to send cv_signal */
3379 /* from the intr callback routine */
3380 /* currently nobody signals this wait */
3381 (void) cv_reltimedwait(&seg->s_cv, &seg->s_lock,
3382 drv_usectohz(1000), TR_CLOCK_TICK);
3384 DBG_PRINTF((category, RSM_ERR,
3385 "rsm_unpublish: SEG_IN_USE\n"));
3387 seg->s_flags &= ~RSM_EXPORT_WAIT;
3388 } else {
3389 if (mode == 1) {
3390 DBG_PRINTF((category, RSM_ERR,
3391 "rsm:rsmpi unpublish err %x\n", e));
3392 seg->s_state = RSM_STATE_BIND;
3394 rsmseglock_release(seg);
3395 return (e);
3399 /* Free segment */
3400 e = adapter->rsmpi_ops->rsm_seg_destroy(seg->s_handle.out);
3402 if (e != RSM_SUCCESS) {
3403 DBG_PRINTF((category, RSM_ERR,
3404 "rsm_unpublish: rsmpi destroy key=%x failed %x\n",
3405 seg->s_key, e));
3408 bypass:
3409 acl = seg->s_acl;
3410 rsmpi_acl = seg->s_acl_in;
3411 acl_len = seg->s_acl_len;
3413 seg->s_acl = NULL;
3414 seg->s_acl_in = NULL;
3415 seg->s_acl_len = 0;
3417 if (seg->s_state == RSM_STATE_EXPORT) {
3418 seg->s_state = RSM_STATE_BIND;
3419 } else if (seg->s_state == RSM_STATE_EXPORT_QUIESCED) {
3420 seg->s_state = RSM_STATE_BIND_QUIESCED;
3421 cv_broadcast(&seg->s_cv);
3424 rsmseglock_release(seg);
3426 rsmacl_free(acl, acl_len);
3427 rsmpiacl_free(rsmpi_acl, acl_len);
3429 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_unpublish done\n"));
3431 return (DDI_SUCCESS);
3435 * Called from rsm_unpublish to force an unload and disconnection of all
3436 * importers of the unpublished segment.
3438 * First build the list of segments requiring a force disconnect, then
3439 * send a request for each.
3441 static void
3442 rsm_send_importer_disconnects(rsm_memseg_id_t ex_segid,
3443 rsm_node_id_t ex_nodeid)
3445 rsmipc_request_t request;
3446 importing_token_t *prev_token, *token, *tmp_token, *tokp;
3447 importing_token_t *force_disconnect_list = NULL;
3448 int index;
3450 DBG_PRINTF((RSM_KERNEL_AGENT | RSM_EXPORT, RSM_DEBUG_VERBOSE,
3451 "rsm_send_importer_disconnects enter\n"));
3453 index = rsmhash(ex_segid);
3455 mutex_enter(&importer_list.lock);
3457 prev_token = NULL;
3458 token = importer_list.bucket[index];
3460 while (token != NULL) {
3461 if (token->key == ex_segid) {
3463 * take it off the importer list and add it
3464 * to the force disconnect list.
3466 if (prev_token == NULL)
3467 importer_list.bucket[index] = token->next;
3468 else
3469 prev_token->next = token->next;
3470 tmp_token = token;
3471 token = token->next;
3472 if (force_disconnect_list == NULL) {
3473 force_disconnect_list = tmp_token;
3474 tmp_token->next = NULL;
3475 } else {
3476 tokp = force_disconnect_list;
3478 * make sure that the tmp_token's node
3479 * is not already on the force disconnect
3480 * list.
3482 while (tokp != NULL) {
3483 if (tokp->importing_node ==
3484 tmp_token->importing_node) {
3485 break;
3487 tokp = tokp->next;
3489 if (tokp == NULL) {
3490 tmp_token->next =
3491 force_disconnect_list;
3492 force_disconnect_list = tmp_token;
3493 } else {
3494 kmem_free((void *)tmp_token,
3495 sizeof (*token));
3499 } else {
3500 prev_token = token;
3501 token = token->next;
3504 mutex_exit(&importer_list.lock);
3506 token = force_disconnect_list;
3507 while (token != NULL) {
3508 if (token->importing_node == my_nodeid) {
3509 rsm_force_unload(ex_nodeid, ex_segid,
3510 DISCONNECT);
3511 } else {
3512 request.rsmipc_hdr.rsmipc_type =
3513 RSMIPC_MSG_DISCONNECT;
3514 request.rsmipc_key = token->key;
3515 for (;;) {
3516 if (rsmipc_send(token->importing_node,
3517 &request,
3518 RSM_NO_REPLY) == RSM_SUCCESS) {
3519 break;
3520 } else {
3521 delay(drv_usectohz(10000));
3525 tmp_token = token;
3526 token = token->next;
3527 kmem_free((void *)tmp_token, sizeof (*token));
3530 DBG_PRINTF((RSM_KERNEL_AGENT | RSM_EXPORT, RSM_DEBUG_VERBOSE,
3531 "rsm_send_importer_disconnects done\n"));
3535 * This function is used as a callback for unlocking the pages locked
3536 * down by a process which then does a fork or an exec.
3537 * It marks the export segments corresponding to umem cookie given by
3538 * the *arg to be in a ZOMBIE state(by calling rsmseg_close to be
3539 * destroyed later when an rsm_close occurs).
3541 static void
3542 rsm_export_force_destroy(ddi_umem_cookie_t *ck)
3544 rsmresource_blk_t *blk;
3545 rsmresource_t *p;
3546 rsmseg_t *eseg = NULL;
3547 int i, j;
3548 int found = 0;
3550 DBG_PRINTF((RSM_KERNEL_AGENT | RSM_EXPORT, RSM_DEBUG_VERBOSE,
3551 "rsm_export_force_destroy enter\n"));
3554 * Walk the resource list and locate the export segment (either
3555 * in the BIND or the EXPORT state) which corresponds to the
3556 * ddi_umem_cookie_t being freed up, and call rsmseg_close.
3557 * Change the state to ZOMBIE by calling rsmseg_close with the
3558 * force_flag argument (the second argument) set to 1. Also,
3559 * unpublish and unbind the segment, but don't free it. Free it
3560 * only on a rsm_close call for the segment.
3562 rw_enter(&rsm_resource.rsmrc_lock, RW_READER);
3564 for (i = 0; i < rsm_resource.rsmrc_len; i++) {
3565 blk = rsm_resource.rsmrc_root[i];
3566 if (blk == NULL) {
3567 continue;
3570 for (j = 0; j < RSMRC_BLKSZ; j++) {
3571 p = blk->rsmrcblk_blks[j];
3572 if ((p != NULL) && (p != RSMRC_RESERVED) &&
3573 (p->rsmrc_type == RSM_RESOURCE_EXPORT_SEGMENT)) {
3574 eseg = (rsmseg_t *)p;
3575 if (eseg->s_cookie != ck)
3576 continue; /* continue searching */
3578 * Found the segment, set flag to indicate
3579 * force destroy processing is in progress
3581 rsmseglock_acquire(eseg);
3582 eseg->s_flags |= RSM_FORCE_DESTROY_WAIT;
3583 rsmseglock_release(eseg);
3584 found = 1;
3585 break;
3589 if (found)
3590 break;
3593 rw_exit(&rsm_resource.rsmrc_lock);
3595 if (found) {
3596 ASSERT(eseg != NULL);
3597 /* call rsmseg_close with force flag set to 1 */
3598 rsmseg_close(eseg, 1);
3600 * force destroy processing done, clear flag and signal any
3601 * thread waiting in rsmseg_close.
3603 rsmseglock_acquire(eseg);
3604 eseg->s_flags &= ~RSM_FORCE_DESTROY_WAIT;
3605 cv_broadcast(&eseg->s_cv);
3606 rsmseglock_release(eseg);
3609 DBG_PRINTF((RSM_KERNEL_AGENT | RSM_EXPORT, RSM_DEBUG_VERBOSE,
3610 "rsm_export_force_destroy done\n"));
3613 /* ******************************* Remote Calls *********************** */
3614 static void
3615 rsm_intr_segconnect(rsm_node_id_t src, rsmipc_request_t *req)
3617 rsmipc_reply_t reply;
3618 DBG_DEFINE(category,
3619 RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_INTR_CALLBACK);
3621 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
3622 "rsm_intr_segconnect enter\n"));
3624 reply.rsmipc_status = (short)rsmsegacl_validate(req, src, &reply);
3626 reply.rsmipc_hdr.rsmipc_type = RSMIPC_MSG_REPLY;
3627 reply.rsmipc_hdr.rsmipc_cookie = req->rsmipc_hdr.rsmipc_cookie;
3629 (void) rsmipc_send(src, NULL, &reply);
3631 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
3632 "rsm_intr_segconnect done\n"));
3637 * When an exported segment is unpublished the exporter sends an ipc
3638 * message (RSMIPC_MSG_DISCONNECT) to all importers. The recv ipc dispatcher
3639 * calls this function. The import list is scanned; segments which match the
3640 * exported segment id are unloaded and disconnected.
3642 * Will also be called from rsm_rebind with disconnect_flag FALSE.
3645 static void
3646 rsm_force_unload(rsm_node_id_t src_nodeid, rsm_memseg_id_t ex_segid,
3647 boolean_t disconnect_flag)
3649 rsmresource_t *p = NULL;
3650 rsmhash_table_t *rhash = &rsm_import_segs;
3651 uint_t index;
3652 DBG_DEFINE(category,
3653 RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_INTR_CALLBACK);
3655 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_force_unload enter\n"));
3657 index = rsmhash(ex_segid);
3659 rw_enter(&rhash->rsmhash_rw, RW_READER);
3661 p = rsmhash_getbkt(rhash, index);
3663 for (; p; p = p->rsmrc_next) {
3664 rsmseg_t *seg = (rsmseg_t *)p;
3665 if ((seg->s_segid == ex_segid) && (seg->s_node == src_nodeid)) {
3667 * In order to make rsmseg_unload and rsm_force_unload
3668 * thread safe, acquire the segment lock here.
3669 * rsmseg_unload is responsible for releasing the lock.
3670 * rsmseg_unload releases the lock just before a call
3671 * to rsmipc_send or in case of an early exit which
3672 * occurs if the segment was in the state
3673 * RSM_STATE_CONNECTING or RSM_STATE_NEW.
3675 rsmseglock_acquire(seg);
3676 if (disconnect_flag)
3677 seg->s_flags |= RSM_FORCE_DISCONNECT;
3678 rsmseg_unload(seg);
3681 rw_exit(&rhash->rsmhash_rw);
3683 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_force_unload done\n"));
3686 static void
3687 rsm_intr_reply(rsmipc_msghdr_t *msg)
3690 * Find slot for cookie in reply.
3691 * Match sequence with sequence in cookie
3692 * If no match; return
3693 * Try to grap lock of slot, if locked return
3694 * copy data into reply slot area
3695 * signal waiter
3697 rsmipc_slot_t *slot;
3698 rsmipc_cookie_t *cookie;
3699 void *data = (void *) msg;
3700 size_t size = sizeof (rsmipc_reply_t);
3701 DBG_DEFINE(category,
3702 RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_INTR_CALLBACK);
3704 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_intr_reply enter\n"));
3706 cookie = &msg->rsmipc_cookie;
3707 if (cookie->ic.index >= RSMIPC_SZ) {
3708 DBG_PRINTF((category, RSM_ERR,
3709 "rsm: rsm_intr_reply bad cookie %d\n", cookie->ic.index));
3710 return;
3713 ASSERT(cookie->ic.index < RSMIPC_SZ);
3714 slot = &rsm_ipc.slots[cookie->ic.index];
3715 mutex_enter(&slot->rsmipc_lock);
3716 if (slot->rsmipc_cookie.value == cookie->value) {
3717 /* found a match */
3718 if (RSMIPC_GET(slot, RSMIPC_PENDING)) {
3719 bcopy(data, slot->rsmipc_data, size);
3720 RSMIPC_CLEAR(slot, RSMIPC_PENDING);
3721 cv_signal(&slot->rsmipc_cv);
3723 } else {
3724 DBG_PRINTF((category, RSM_DEBUG,
3725 "rsm: rsm_intr_reply mismatched reply %d\n",
3726 cookie->ic.index));
3728 mutex_exit(&slot->rsmipc_lock);
3729 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_intr_reply done\n"));
3733 * This function gets dispatched on the worker thread when we receive
3734 * the SQREADY message. This function sends the SQREADY_ACK message.
3736 static void
3737 rsm_sqready_ack_deferred(void *arg)
3739 path_t *path = (path_t *)arg;
3740 DBG_DEFINE(category,
3741 RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_INTR_CALLBACK);
3743 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
3744 "rsm_sqready_ack_deferred enter\n"));
3746 mutex_enter(&path->mutex);
3749 * If path is not active no point in sending the ACK
3750 * because the whole SQREADY protocol will again start
3751 * when the path becomes active.
3753 if (path->state != RSMKA_PATH_ACTIVE) {
3755 * decrement the path refcnt incremented in rsm_proc_sqready
3757 PATH_RELE_NOLOCK(path);
3758 mutex_exit(&path->mutex);
3759 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
3760 "rsm_sqready_ack_deferred done:!ACTIVE\n"));
3761 return;
3764 /* send an SQREADY_ACK message */
3765 (void) rsmipc_send_controlmsg(path, RSMIPC_MSG_SQREADY_ACK);
3767 /* initialize credits to the max level */
3768 path->sendq_token.msgbuf_avail = RSMIPC_MAX_MESSAGES;
3770 /* wake up any send that is waiting for credits */
3771 cv_broadcast(&path->sendq_token.sendq_cv);
3774 * decrement the path refcnt since we incremented it in
3775 * rsm_proc_sqready
3777 PATH_RELE_NOLOCK(path);
3779 mutex_exit(&path->mutex);
3781 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
3782 "rsm_sqready_ack_deferred done\n"));
3786 * Process the SQREADY message
3788 static void
3789 rsm_proc_sqready(rsmipc_controlmsg_t *msg, rsm_addr_t src_hwaddr,
3790 rsm_intr_hand_arg_t arg)
3792 rsmipc_msghdr_t *msghdr = (rsmipc_msghdr_t *)msg;
3793 srv_handler_arg_t *hdlr_argp = (srv_handler_arg_t *)arg;
3794 path_t *path;
3795 DBG_DEFINE(category,
3796 RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_INTR_CALLBACK);
3798 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_proc_sqready enter\n"));
3800 /* look up the path - incr the path refcnt */
3801 path = rsm_find_path(hdlr_argp->adapter_name,
3802 hdlr_argp->adapter_instance, src_hwaddr);
3805 * No path exists or path is not active - drop the message
3807 if (path == NULL) {
3808 DBG_PRINTF((category, RSM_DEBUG,
3809 "rsm_proc_sqready done: msg dropped no path\n"));
3810 return;
3813 mutex_exit(&path->mutex);
3815 /* drain any tasks from the previous incarnation */
3816 taskq_wait(path->recv_taskq);
3818 mutex_enter(&path->mutex);
3820 * If we'd sent an SQREADY message and were waiting for SQREADY_ACK
3821 * in the meanwhile we received an SQREADY message, blindly reset
3822 * the WAIT_FOR_SQACK flag because we'll just send SQREADY_ACK
3823 * and forget about the SQREADY that we sent.
3825 path->flags &= ~RSMKA_WAIT_FOR_SQACK;
3827 if (path->state != RSMKA_PATH_ACTIVE) {
3828 /* decr refcnt and drop the mutex */
3829 PATH_RELE_NOLOCK(path);
3830 mutex_exit(&path->mutex);
3831 DBG_PRINTF((category, RSM_DEBUG,
3832 "rsm_proc_sqready done: msg dropped path !ACTIVE\n"));
3833 return;
3836 DBG_PRINTF((category, RSM_DEBUG, "rsm_proc_sqready:path=%lx "
3837 " src=%lx:%llx\n", path, msghdr->rsmipc_src, src_hwaddr));
3840 * The sender's local incarnation number is our remote incarnation
3841 * number save it in the path data structure
3843 path->remote_incn = msg->rsmipc_local_incn;
3844 path->sendq_token.msgbuf_avail = 0;
3845 path->procmsg_cnt = 0;
3848 * path is active - dispatch task to send SQREADY_ACK - remember
3849 * RSMPI calls can't be done in interrupt context
3851 * We can use the recv_taskq to send because the remote endpoint
3852 * cannot start sending messages till it receives SQREADY_ACK hence
3853 * at this point there are no tasks on recv_taskq.
3855 * The path refcnt will be decremented in rsm_sqready_ack_deferred.
3857 (void) taskq_dispatch(path->recv_taskq,
3858 rsm_sqready_ack_deferred, path, KM_NOSLEEP);
3860 mutex_exit(&path->mutex);
3863 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_proc_sqready done\n"));
3867 * Process the SQREADY_ACK message
3869 static void
3870 rsm_proc_sqready_ack(rsmipc_controlmsg_t *msg, rsm_addr_t src_hwaddr,
3871 rsm_intr_hand_arg_t arg)
3873 rsmipc_msghdr_t *msghdr = (rsmipc_msghdr_t *)msg;
3874 srv_handler_arg_t *hdlr_argp = (srv_handler_arg_t *)arg;
3875 path_t *path;
3876 DBG_DEFINE(category,
3877 RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_INTR_CALLBACK);
3879 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
3880 "rsm_proc_sqready_ack enter\n"));
3882 /* look up the path - incr the path refcnt */
3883 path = rsm_find_path(hdlr_argp->adapter_name,
3884 hdlr_argp->adapter_instance, src_hwaddr);
3887 * drop the message if - no path exists or path is not active
3888 * or if its not waiting for SQREADY_ACK message
3890 if (path == NULL) {
3891 DBG_PRINTF((category, RSM_DEBUG,
3892 "rsm_proc_sqready_ack done: msg dropped no path\n"));
3893 return;
3896 if ((path->state != RSMKA_PATH_ACTIVE) ||
3897 !(path->flags & RSMKA_WAIT_FOR_SQACK)) {
3898 /* decrement the refcnt */
3899 PATH_RELE_NOLOCK(path);
3900 mutex_exit(&path->mutex);
3901 DBG_PRINTF((category, RSM_DEBUG,
3902 "rsm_proc_sqready_ack done: msg dropped\n"));
3903 return;
3907 * Check if this message is in response to the last RSMIPC_MSG_SQREADY
3908 * sent, if not drop it.
3910 if (path->local_incn != msghdr->rsmipc_incn) {
3911 /* decrement the refcnt */
3912 PATH_RELE_NOLOCK(path);
3913 mutex_exit(&path->mutex);
3914 DBG_PRINTF((category, RSM_DEBUG,
3915 "rsm_proc_sqready_ack done: msg old incn %lld\n",
3916 msghdr->rsmipc_incn));
3917 return;
3920 DBG_PRINTF((category, RSM_DEBUG, "rsm_proc_sqready_ack:path=%lx "
3921 " src=%lx:%llx\n", path, msghdr->rsmipc_src, src_hwaddr));
3924 * clear the WAIT_FOR_SQACK flag since we have recvd the ack
3926 path->flags &= ~RSMKA_WAIT_FOR_SQACK;
3928 /* save the remote sendq incn number */
3929 path->remote_incn = msg->rsmipc_local_incn;
3931 /* initialize credits to the max level */
3932 path->sendq_token.msgbuf_avail = RSMIPC_MAX_MESSAGES;
3934 /* wake up any send that is waiting for credits */
3935 cv_broadcast(&path->sendq_token.sendq_cv);
3937 /* decrement the refcnt */
3938 PATH_RELE_NOLOCK(path);
3940 mutex_exit(&path->mutex);
3942 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
3943 "rsm_proc_sqready_ack done\n"));
3947 * process the RSMIPC_MSG_CREDIT message
3949 static void
3950 rsm_add_credits(rsmipc_controlmsg_t *msg, rsm_addr_t src_hwaddr,
3951 rsm_intr_hand_arg_t arg)
3953 rsmipc_msghdr_t *msghdr = (rsmipc_msghdr_t *)msg;
3954 srv_handler_arg_t *hdlr_argp = (srv_handler_arg_t *)arg;
3955 path_t *path;
3956 DBG_DEFINE(category,
3957 RSM_KERNEL_AGENT | RSM_FUNC_ALL |
3958 RSM_INTR_CALLBACK | RSM_FLOWCONTROL);
3960 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_add_credits enter\n"));
3962 /* look up the path - incr the path refcnt */
3963 path = rsm_find_path(hdlr_argp->adapter_name,
3964 hdlr_argp->adapter_instance, src_hwaddr);
3966 if (path == NULL) {
3967 DBG_PRINTF((category, RSM_DEBUG,
3968 "rsm_add_credits enter: path not found\n"));
3969 return;
3972 /* the path is not active - discard credits */
3973 if (path->state != RSMKA_PATH_ACTIVE) {
3974 PATH_RELE_NOLOCK(path);
3975 mutex_exit(&path->mutex);
3976 DBG_PRINTF((category, RSM_DEBUG,
3977 "rsm_add_credits enter:path=%lx !ACTIVE\n", path));
3978 return;
3982 * Check if these credits are for current incarnation of the path.
3984 if (path->local_incn != msghdr->rsmipc_incn) {
3985 /* decrement the refcnt */
3986 PATH_RELE_NOLOCK(path);
3987 mutex_exit(&path->mutex);
3988 DBG_PRINTF((category, RSM_DEBUG,
3989 "rsm_add_credits enter: old incn %lld\n",
3990 msghdr->rsmipc_incn));
3991 return;
3994 DBG_PRINTF((category, RSM_DEBUG,
3995 "rsm_add_credits:path=%lx new-creds=%d "
3996 "curr credits=%d src=%lx:%llx\n", path, msg->rsmipc_credits,
3997 path->sendq_token.msgbuf_avail, msghdr->rsmipc_src,
3998 src_hwaddr));
4001 /* add credits to the path's sendq */
4002 path->sendq_token.msgbuf_avail += msg->rsmipc_credits;
4004 ASSERT(path->sendq_token.msgbuf_avail <= RSMIPC_MAX_MESSAGES);
4006 /* wake up any send that is waiting for credits */
4007 cv_broadcast(&path->sendq_token.sendq_cv);
4009 /* decrement the refcnt */
4010 PATH_RELE_NOLOCK(path);
4012 mutex_exit(&path->mutex);
4014 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_add_credits done\n"));
4017 static void
4018 rsm_intr_event(rsmipc_request_t *msg)
4020 rsmseg_t *seg;
4021 rsmresource_t *p;
4022 rsm_node_id_t src_node;
4023 DBG_DEFINE(category,
4024 RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_INTR_CALLBACK);
4026 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_intr_event enter\n"));
4028 src_node = msg->rsmipc_hdr.rsmipc_src;
4030 if ((seg = msg->rsmipc_segment_cookie) != NULL) {
4031 /* This is for an import segment */
4032 uint_t hashval = rsmhash(msg->rsmipc_key);
4034 rw_enter(&rsm_import_segs.rsmhash_rw, RW_READER);
4036 p = (rsmresource_t *)rsmhash_getbkt(&rsm_import_segs, hashval);
4038 for (; p; p = p->rsmrc_next) {
4039 if ((p->rsmrc_key == msg->rsmipc_key) &&
4040 (p->rsmrc_node == src_node)) {
4041 seg = (rsmseg_t *)p;
4042 rsmseglock_acquire(seg);
4044 atomic_inc_32(&seg->s_pollevent);
4046 if (seg->s_pollflag & RSM_SEGMENT_POLL)
4047 pollwakeup(&seg->s_poll, POLLRDNORM);
4049 rsmseglock_release(seg);
4053 rw_exit(&rsm_import_segs.rsmhash_rw);
4054 } else {
4055 /* This is for an export segment */
4056 seg = rsmexport_lookup(msg->rsmipc_key);
4057 if (!seg) {
4058 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
4059 "rsm_intr_event done: exp seg not found\n"));
4060 return;
4063 ASSERT(rsmseglock_held(seg));
4065 atomic_inc_32(&seg->s_pollevent);
4068 * We must hold the segment lock here, or else the segment
4069 * can be freed while pollwakeup is using it. This implies
4070 * that we MUST NOT grab the segment lock during rsm_chpoll,
4071 * as outlined in the chpoll(2) man page.
4073 if (seg->s_pollflag & RSM_SEGMENT_POLL)
4074 pollwakeup(&seg->s_poll, POLLRDNORM);
4076 rsmseglock_release(seg);
4079 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_intr_event done\n"));
4083 * The exporter did a republish and changed the ACL - this change is only
4084 * visible to new importers.
4086 static void
4087 importer_update(rsm_node_id_t src_node, rsm_memseg_id_t key,
4088 rsm_permission_t perm)
4091 rsmresource_t *p;
4092 rsmseg_t *seg;
4093 uint_t hashval = rsmhash(key);
4094 DBG_DEFINE(category,
4095 RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_INTR_CALLBACK);
4097 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "importer_update enter\n"));
4099 rw_enter(&rsm_import_segs.rsmhash_rw, RW_READER);
4101 p = (rsmresource_t *)rsmhash_getbkt(&rsm_import_segs, hashval);
4103 for (; p; p = p->rsmrc_next) {
4105 * find the importer and update the permission in the shared
4106 * data structure. Any new importers will use the new perms
4108 if ((p->rsmrc_key == key) && (p->rsmrc_node == src_node)) {
4109 seg = (rsmseg_t *)p;
4111 rsmseglock_acquire(seg);
4112 rsmsharelock_acquire(seg);
4113 seg->s_share->rsmsi_mode = perm;
4114 rsmsharelock_release(seg);
4115 rsmseglock_release(seg);
4117 break;
4121 rw_exit(&rsm_import_segs.rsmhash_rw);
4123 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "importer_update done\n"));
4126 void
4127 rsm_suspend_complete(rsm_node_id_t src_node, int flag)
4129 int done = 1; /* indicate all SUSPENDS have been acked */
4130 list_element_t *elem;
4131 DBG_DEFINE(category,
4132 RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_INTR_CALLBACK);
4134 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
4135 "rsm_suspend_complete enter\n"));
4137 mutex_enter(&rsm_suspend_list.list_lock);
4139 if (rsm_suspend_list.list_head == NULL) {
4140 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
4141 "rsm_suspend_complete done: suspend_list is empty\n"));
4142 mutex_exit(&rsm_suspend_list.list_lock);
4143 return;
4146 elem = rsm_suspend_list.list_head;
4147 while (elem != NULL) {
4148 if (elem->nodeid == src_node) {
4149 /* clear the pending flag for the node */
4150 elem->flags &= ~RSM_SUSPEND_ACKPENDING;
4151 elem->flags |= flag;
4154 if (done && (elem->flags & RSM_SUSPEND_ACKPENDING))
4155 done = 0; /* still some nodes have not yet ACKED */
4157 elem = elem->next;
4160 mutex_exit(&rsm_suspend_list.list_lock);
4162 if (!done) {
4163 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
4164 "rsm_suspend_complete done: acks pending\n"));
4165 return;
4168 * Now that we are done with suspending all the remote importers
4169 * time to quiesce the local exporters
4171 exporter_quiesce();
4173 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
4174 "rsm_suspend_complete done\n"));
4177 static void
4178 exporter_quiesce()
4180 int i, e;
4181 rsmresource_t *current;
4182 rsmseg_t *seg;
4183 adapter_t *adapter;
4184 DBG_DEFINE(category,
4185 RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_INTR_CALLBACK);
4187 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "exporter_quiesce enter\n"));
4189 * The importers send a SUSPEND_COMPLETE to the exporter node
4190 * Unpublish, unbind the export segment and
4191 * move the segments to the EXPORT_QUIESCED state
4194 rw_enter(&rsm_export_segs.rsmhash_rw, RW_READER);
4196 for (i = 0; i < rsm_hash_size; i++) {
4197 current = rsm_export_segs.bucket[i];
4198 while (current != NULL) {
4199 seg = (rsmseg_t *)current;
4200 rsmseglock_acquire(seg);
4201 if (current->rsmrc_state ==
4202 RSM_STATE_EXPORT_QUIESCING) {
4203 adapter = seg->s_adapter;
4205 * some local memory handles are not published
4206 * check if it was published
4208 if ((seg->s_acl == NULL) ||
4209 (seg->s_acl[0].ae_node != my_nodeid) ||
4210 (seg->s_acl[0].ae_permission != 0)) {
4212 e = adapter->rsmpi_ops->rsm_unpublish(
4213 seg->s_handle.out);
4214 DBG_PRINTF((category, RSM_DEBUG,
4215 "exporter_quiesce:unpub %d\n", e));
4217 e = adapter->rsmpi_ops->rsm_seg_destroy(
4218 seg->s_handle.out);
4220 DBG_PRINTF((category, RSM_DEBUG,
4221 "exporter_quiesce:destroy %d\n",
4222 e));
4225 (void) rsm_unbind_pages(seg);
4226 seg->s_state = RSM_STATE_EXPORT_QUIESCED;
4227 cv_broadcast(&seg->s_cv);
4229 rsmseglock_release(seg);
4230 current = current->rsmrc_next;
4233 rw_exit(&rsm_export_segs.rsmhash_rw);
4236 * All the local segments we are done with the pre-del processing
4237 * - time to move to PREDEL_COMPLETED.
4240 mutex_enter(&rsm_drv_data.drv_lock);
4242 ASSERT(rsm_drv_data.drv_state == RSM_DRV_PREDEL_STARTED);
4244 rsm_drv_data.drv_state = RSM_DRV_PREDEL_COMPLETED;
4246 cv_broadcast(&rsm_drv_data.drv_cv);
4248 mutex_exit(&rsm_drv_data.drv_lock);
4250 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "exporter_quiesce done\n"));
4253 static void
4254 importer_suspend(rsm_node_id_t src_node)
4256 int i;
4257 int susp_flg; /* true means already suspended */
4258 int num_importers;
4259 rsmresource_t *p = NULL, *curp;
4260 rsmhash_table_t *rhash = &rsm_import_segs;
4261 rsmseg_t *seg;
4262 rsmipc_request_t request;
4263 DBG_DEFINE(category,
4264 RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_INTR_CALLBACK);
4266 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "importer_suspend enter\n"));
4268 rw_enter(&rhash->rsmhash_rw, RW_READER);
4269 for (i = 0; i < rsm_hash_size; i++) {
4270 p = rhash->bucket[i];
4273 * Suspend all importers with same <node, key> pair.
4274 * After the last one of the shared importers has been
4275 * suspended - suspend the shared mappings/connection.
4277 for (; p; p = p->rsmrc_next) {
4278 rsmseg_t *first = (rsmseg_t *)p;
4279 if ((first->s_node != src_node) ||
4280 (first->s_state == RSM_STATE_DISCONNECT))
4281 continue; /* go to next entry */
4283 * search the rest of the bucket for
4284 * other siblings (imprtrs with the same key)
4285 * of "first" and suspend them.
4286 * All importers with same key fall in
4287 * the same bucket.
4289 num_importers = 0;
4290 for (curp = p; curp; curp = curp->rsmrc_next) {
4291 seg = (rsmseg_t *)curp;
4293 rsmseglock_acquire(seg);
4295 if ((seg->s_node != first->s_node) ||
4296 (seg->s_key != first->s_key) ||
4297 (seg->s_state == RSM_STATE_DISCONNECT)) {
4299 * either not a peer segment or its a
4300 * disconnected segment - skip it
4302 rsmseglock_release(seg);
4303 continue;
4306 rsmseg_suspend(seg, &susp_flg);
4308 if (susp_flg) { /* seg already suspended */
4309 rsmseglock_release(seg);
4310 break; /* the inner for loop */
4313 num_importers++;
4314 rsmsharelock_acquire(seg);
4316 * we've processed all importers that are
4317 * siblings of "first"
4319 if (num_importers ==
4320 seg->s_share->rsmsi_refcnt) {
4321 rsmsharelock_release(seg);
4322 rsmseglock_release(seg);
4323 break;
4325 rsmsharelock_release(seg);
4326 rsmseglock_release(seg);
4330 * All the importers with the same key and
4331 * nodeid as "first" have been suspended.
4332 * Now suspend the shared connect/mapping.
4333 * This is done only once.
4335 if (!susp_flg) {
4336 rsmsegshare_suspend(seg);
4341 rw_exit(&rhash->rsmhash_rw);
4343 /* send an ACK for SUSPEND message */
4344 request.rsmipc_hdr.rsmipc_type = RSMIPC_MSG_SUSPEND_DONE;
4345 (void) rsmipc_send(src_node, &request, RSM_NO_REPLY);
4348 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "importer_suspend done\n"));
4352 static void
4353 rsmseg_suspend(rsmseg_t *seg, int *susp_flg)
4355 int recheck_state;
4356 rsmcookie_t *hdl;
4357 DBG_DEFINE(category,
4358 RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_INTR_CALLBACK);
4360 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
4361 "rsmseg_suspend enter: key=%u\n", seg->s_key));
4363 *susp_flg = 0;
4365 ASSERT(rsmseglock_held(seg));
4366 /* wait if putv/getv is in progress */
4367 while (seg->s_rdmacnt > 0)
4368 cv_wait(&seg->s_cv, &seg->s_lock);
4370 do {
4371 recheck_state = 0;
4373 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
4374 "rsmseg_suspend:segment %x state=%d\n",
4375 seg->s_key, seg->s_state));
4377 switch (seg->s_state) {
4378 case RSM_STATE_NEW:
4379 /* not a valid state */
4380 break;
4381 case RSM_STATE_CONNECTING:
4382 seg->s_state = RSM_STATE_ABORT_CONNECT;
4383 break;
4384 case RSM_STATE_ABORT_CONNECT:
4385 break;
4386 case RSM_STATE_CONNECT:
4387 seg->s_handle.in = NULL;
4388 seg->s_state = RSM_STATE_CONN_QUIESCE;
4389 break;
4390 case RSM_STATE_MAPPING:
4391 /* wait until segment leaves the mapping state */
4392 while (seg->s_state == RSM_STATE_MAPPING)
4393 cv_wait(&seg->s_cv, &seg->s_lock);
4394 recheck_state = 1;
4395 break;
4396 case RSM_STATE_ACTIVE:
4397 /* unload the mappings */
4398 if (seg->s_ckl != NULL) {
4399 hdl = seg->s_ckl;
4400 for (; hdl != NULL; hdl = hdl->c_next) {
4401 (void) devmap_unload(hdl->c_dhp,
4402 hdl->c_off, hdl->c_len);
4405 seg->s_mapinfo = NULL;
4406 seg->s_state = RSM_STATE_MAP_QUIESCE;
4407 break;
4408 case RSM_STATE_CONN_QUIESCE:
4409 /* FALLTHRU */
4410 case RSM_STATE_MAP_QUIESCE:
4411 /* rsmseg_suspend already done for seg */
4412 *susp_flg = 1;
4413 break;
4414 case RSM_STATE_DISCONNECT:
4415 break;
4416 default:
4417 ASSERT(0); /* invalid state */
4419 } while (recheck_state);
4421 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmseg_suspend done\n"));
4424 static void
4425 rsmsegshare_suspend(rsmseg_t *seg)
4427 int e;
4428 adapter_t *adapter;
4429 rsm_import_share_t *sharedp;
4430 DBG_DEFINE(category,
4431 RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_INTR_CALLBACK);
4433 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
4434 "rsmsegshare_suspend enter\n"));
4436 rsmseglock_acquire(seg);
4437 rsmsharelock_acquire(seg);
4439 sharedp = seg->s_share;
4440 adapter = seg->s_adapter;
4441 switch (sharedp->rsmsi_state) {
4442 case RSMSI_STATE_NEW:
4443 break;
4444 case RSMSI_STATE_CONNECTING:
4445 sharedp->rsmsi_state = RSMSI_STATE_ABORT_CONNECT;
4446 break;
4447 case RSMSI_STATE_ABORT_CONNECT:
4448 break;
4449 case RSMSI_STATE_CONNECTED:
4450 /* do the rsmpi disconnect */
4451 if (sharedp->rsmsi_node != my_nodeid) {
4452 e = adapter->rsmpi_ops->
4453 rsm_disconnect(sharedp->rsmsi_handle);
4455 DBG_PRINTF((category, RSM_DEBUG,
4456 "rsm:rsmpi disconnect seg=%x:err=%d\n",
4457 sharedp->rsmsi_segid, e));
4460 sharedp->rsmsi_handle = NULL;
4462 sharedp->rsmsi_state = RSMSI_STATE_CONN_QUIESCE;
4463 break;
4464 case RSMSI_STATE_CONN_QUIESCE:
4465 break;
4466 case RSMSI_STATE_MAPPED:
4467 /* do the rsmpi unmap and disconnect */
4468 if (sharedp->rsmsi_node != my_nodeid) {
4469 e = adapter->rsmpi_ops->rsm_unmap(seg->s_handle.in);
4471 DBG_PRINTF((category, RSM_DEBUG,
4472 "rsmshare_suspend: rsmpi unmap %d\n", e));
4474 e = adapter->rsmpi_ops->
4475 rsm_disconnect(sharedp->rsmsi_handle);
4476 DBG_PRINTF((category, RSM_DEBUG,
4477 "rsm:rsmpi disconnect seg=%x:err=%d\n",
4478 sharedp->rsmsi_segid, e));
4481 sharedp->rsmsi_handle = NULL;
4483 sharedp->rsmsi_state = RSMSI_STATE_MAP_QUIESCE;
4484 break;
4485 case RSMSI_STATE_MAP_QUIESCE:
4486 break;
4487 case RSMSI_STATE_DISCONNECTED:
4488 break;
4489 default:
4490 ASSERT(0); /* invalid state */
4493 rsmsharelock_release(seg);
4494 rsmseglock_release(seg);
4496 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
4497 "rsmsegshare_suspend done\n"));
4501 * This should get called on receiving a RESUME message or from
4502 * the pathmanger if the node undergoing DR dies.
4504 static void
4505 importer_resume(rsm_node_id_t src_node)
4507 int i;
4508 rsmresource_t *p = NULL;
4509 rsmhash_table_t *rhash = &rsm_import_segs;
4510 void *cookie;
4511 DBG_DEFINE(category,
4512 RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_INTR_CALLBACK);
4514 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "importer_resume enter\n"));
4516 rw_enter(&rhash->rsmhash_rw, RW_READER);
4518 for (i = 0; i < rsm_hash_size; i++) {
4519 p = rhash->bucket[i];
4521 for (; p; p = p->rsmrc_next) {
4522 rsmseg_t *seg = (rsmseg_t *)p;
4524 rsmseglock_acquire(seg);
4526 /* process only importers of node undergoing DR */
4527 if (seg->s_node != src_node) {
4528 rsmseglock_release(seg);
4529 continue;
4532 if (rsmseg_resume(seg, &cookie) != RSM_SUCCESS) {
4533 rsmipc_request_t request;
4535 * rsmpi map/connect failed
4536 * inform the exporter so that it can
4537 * remove the importer.
4539 request.rsmipc_hdr.rsmipc_type =
4540 RSMIPC_MSG_NOTIMPORTING;
4541 request.rsmipc_key = seg->s_segid;
4542 request.rsmipc_segment_cookie = cookie;
4543 rsmseglock_release(seg);
4544 (void) rsmipc_send(seg->s_node, &request,
4545 RSM_NO_REPLY);
4546 } else {
4547 rsmseglock_release(seg);
4552 rw_exit(&rhash->rsmhash_rw);
4554 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "importer_resume done\n"));
4557 static int
4558 rsmseg_resume(rsmseg_t *seg, void **cookie)
4560 int e;
4561 int retc;
4562 off_t dev_offset;
4563 size_t maplen;
4564 uint_t maxprot;
4565 rsm_mapinfo_t *p;
4566 rsmcookie_t *hdl;
4567 rsm_import_share_t *sharedp;
4568 DBG_DEFINE(category,
4569 RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_INTR_CALLBACK);
4571 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
4572 "rsmseg_resume enter: key=%u\n", seg->s_key));
4574 *cookie = NULL;
4576 ASSERT(rsmseglock_held(seg));
4578 if ((seg->s_state != RSM_STATE_CONN_QUIESCE) &&
4579 (seg->s_state != RSM_STATE_MAP_QUIESCE)) {
4580 return (RSM_SUCCESS);
4583 sharedp = seg->s_share;
4585 rsmsharelock_acquire(seg);
4587 /* resume the shared connection and/or mapping */
4588 retc = rsmsegshare_resume(seg);
4590 if (seg->s_state == RSM_STATE_CONN_QUIESCE) {
4591 /* shared state can either be connected or mapped */
4592 if ((sharedp->rsmsi_state == RSMSI_STATE_CONNECTED) ||
4593 (sharedp->rsmsi_state == RSMSI_STATE_MAPPED)) {
4594 ASSERT(retc == RSM_SUCCESS);
4595 seg->s_handle.in = sharedp->rsmsi_handle;
4596 rsmsharelock_release(seg);
4597 seg->s_state = RSM_STATE_CONNECT;
4599 } else { /* error in rsmpi connect during resume */
4600 seg->s_handle.in = NULL;
4601 seg->s_state = RSM_STATE_DISCONNECT;
4603 sharedp->rsmsi_refcnt--;
4604 cookie = (void *)sharedp->rsmsi_cookie;
4606 if (sharedp->rsmsi_refcnt == 0) {
4607 ASSERT(sharedp->rsmsi_mapcnt == 0);
4608 rsmsharelock_release(seg);
4610 /* clean up the shared data structure */
4611 mutex_destroy(&sharedp->rsmsi_lock);
4612 cv_destroy(&sharedp->rsmsi_cv);
4613 kmem_free((void *)(sharedp),
4614 sizeof (rsm_import_share_t));
4616 } else {
4617 rsmsharelock_release(seg);
4620 * The following needs to be done after any
4621 * rsmsharelock calls which use seg->s_share.
4623 seg->s_share = NULL;
4626 /* signal any waiting segment */
4627 cv_broadcast(&seg->s_cv);
4629 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
4630 "rsmseg_resume done:state=%d\n", seg->s_state));
4631 return (retc);
4634 ASSERT(seg->s_state == RSM_STATE_MAP_QUIESCE);
4636 /* Setup protections for remap */
4637 maxprot = PROT_USER;
4638 if (seg->s_mode & RSM_PERM_READ) {
4639 maxprot |= PROT_READ;
4641 if (seg->s_mode & RSM_PERM_WRITE) {
4642 maxprot |= PROT_WRITE;
4645 if (sharedp->rsmsi_state != RSMSI_STATE_MAPPED) {
4646 /* error in rsmpi connect or map during resume */
4648 /* remap to trash page */
4649 ASSERT(seg->s_ckl != NULL);
4651 for (hdl = seg->s_ckl; hdl != NULL; hdl = hdl->c_next) {
4652 e = devmap_umem_remap(hdl->c_dhp, rsm_dip,
4653 remap_cookie, hdl->c_off, hdl->c_len,
4654 maxprot, 0, NULL);
4656 DBG_PRINTF((category, RSM_ERR,
4657 "rsmseg_resume:remap=%d\n", e));
4660 seg->s_handle.in = NULL;
4661 seg->s_state = RSM_STATE_DISCONNECT;
4663 sharedp->rsmsi_refcnt--;
4665 sharedp->rsmsi_mapcnt--;
4666 seg->s_mapinfo = NULL;
4668 if (sharedp->rsmsi_refcnt == 0) {
4669 ASSERT(sharedp->rsmsi_mapcnt == 0);
4670 rsmsharelock_release(seg);
4672 /* clean up the shared data structure */
4673 mutex_destroy(&sharedp->rsmsi_lock);
4674 cv_destroy(&sharedp->rsmsi_cv);
4675 kmem_free((void *)(sharedp),
4676 sizeof (rsm_import_share_t));
4678 } else {
4679 rsmsharelock_release(seg);
4682 * The following needs to be done after any
4683 * rsmsharelock calls which use seg->s_share.
4685 seg->s_share = NULL;
4687 /* signal any waiting segment */
4688 cv_broadcast(&seg->s_cv);
4690 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
4691 "rsmseg_resume done:seg=%x,err=%d\n",
4692 seg->s_key, retc));
4693 return (retc);
4697 seg->s_handle.in = sharedp->rsmsi_handle;
4699 if (seg->s_node == my_nodeid) { /* loopback */
4700 ASSERT(seg->s_mapinfo == NULL);
4702 for (hdl = seg->s_ckl; hdl != NULL; hdl = hdl->c_next) {
4703 e = devmap_umem_remap(hdl->c_dhp,
4704 rsm_dip, seg->s_cookie,
4705 hdl->c_off, hdl->c_len,
4706 maxprot, 0, NULL);
4708 DBG_PRINTF((category, RSM_ERR,
4709 "rsmseg_resume:remap=%d\n", e));
4711 } else { /* remote exporter */
4712 /* remap to the new rsmpi maps */
4713 seg->s_mapinfo = sharedp->rsmsi_mapinfo;
4715 for (hdl = seg->s_ckl; hdl != NULL; hdl = hdl->c_next) {
4716 p = rsm_get_mapinfo(seg, hdl->c_off, hdl->c_len,
4717 &dev_offset, &maplen);
4718 e = devmap_devmem_remap(hdl->c_dhp,
4719 p->dip, p->dev_register, dev_offset,
4720 maplen, maxprot, 0, NULL);
4722 DBG_PRINTF((category, RSM_ERR,
4723 "rsmseg_resume:remap=%d\n", e));
4727 rsmsharelock_release(seg);
4729 seg->s_state = RSM_STATE_ACTIVE;
4730 cv_broadcast(&seg->s_cv);
4732 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmseg_resume done\n"));
4734 return (retc);
4737 static int
4738 rsmsegshare_resume(rsmseg_t *seg)
4740 int e = RSM_SUCCESS;
4741 adapter_t *adapter;
4742 rsm_import_share_t *sharedp;
4743 DBG_DEFINE(category,
4744 RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_INTR_CALLBACK);
4746 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmsegshare_resume enter\n"));
4748 ASSERT(rsmseglock_held(seg));
4749 ASSERT(rsmsharelock_held(seg));
4751 sharedp = seg->s_share;
4754 * If we are not in a xxxx_QUIESCE state that means shared
4755 * connect/mapping processing has been already been done
4756 * so return success.
4758 if ((sharedp->rsmsi_state != RSMSI_STATE_CONN_QUIESCE) &&
4759 (sharedp->rsmsi_state != RSMSI_STATE_MAP_QUIESCE)) {
4760 return (RSM_SUCCESS);
4763 adapter = seg->s_adapter;
4765 if (sharedp->rsmsi_node != my_nodeid) {
4766 rsm_addr_t hwaddr;
4767 hwaddr = get_remote_hwaddr(adapter, sharedp->rsmsi_node);
4769 e = adapter->rsmpi_ops->rsm_connect(
4770 adapter->rsmpi_handle, hwaddr,
4771 sharedp->rsmsi_segid, &sharedp->rsmsi_handle);
4773 DBG_PRINTF((category, RSM_DEBUG,
4774 "rsmsegshare_resume:rsmpi connect seg=%x:err=%d\n",
4775 sharedp->rsmsi_segid, e));
4777 if (e != RSM_SUCCESS) {
4778 /* when do we send the NOT_IMPORTING message */
4779 sharedp->rsmsi_handle = NULL;
4780 sharedp->rsmsi_state = RSMSI_STATE_DISCONNECTED;
4781 /* signal any waiting segment */
4782 cv_broadcast(&sharedp->rsmsi_cv);
4783 return (e);
4787 if (sharedp->rsmsi_state == RSMSI_STATE_CONN_QUIESCE) {
4788 sharedp->rsmsi_state = RSMSI_STATE_CONNECTED;
4789 /* signal any waiting segment */
4790 cv_broadcast(&sharedp->rsmsi_cv);
4791 return (e);
4794 ASSERT(sharedp->rsmsi_state == RSMSI_STATE_MAP_QUIESCE);
4796 /* do the rsmpi map of the whole segment here */
4797 if (sharedp->rsmsi_node != my_nodeid) {
4798 size_t mapped_len;
4799 rsm_mapinfo_t *p;
4802 * We need to do rsmpi maps with <off, lens> identical to
4803 * the old mapinfo list because the segment mapping handles
4804 * dhp and such need the fragmentation of rsmpi maps to be
4805 * identical to what it was during the mmap of the segment
4807 p = sharedp->rsmsi_mapinfo;
4809 while (p != NULL) {
4810 mapped_len = 0;
4812 e = adapter->rsmpi_ops->rsm_map(
4813 sharedp->rsmsi_handle, p->start_offset,
4814 p->individual_len, &mapped_len,
4815 &p->dip, &p->dev_register, &p->dev_offset,
4816 NULL, NULL);
4818 if (e != 0) {
4819 DBG_PRINTF((category, RSM_ERR,
4820 "rsmsegshare_resume: rsmpi map err=%d\n",
4821 e));
4822 break;
4825 if (mapped_len != p->individual_len) {
4826 DBG_PRINTF((category, RSM_ERR,
4827 "rsmsegshare_resume: rsmpi maplen"
4828 "< reqlen=%lx\n", mapped_len));
4829 e = RSMERR_BAD_LENGTH;
4830 break;
4833 p = p->next;
4838 if (e != RSM_SUCCESS) { /* rsmpi map failed */
4839 int err;
4840 /* Check if this is the first rsm_map */
4841 if (p != sharedp->rsmsi_mapinfo) {
4843 * A single rsm_unmap undoes multiple rsm_maps.
4845 (void) seg->s_adapter->rsmpi_ops->
4846 rsm_unmap(sharedp->rsmsi_handle);
4849 rsm_free_mapinfo(sharedp->rsmsi_mapinfo);
4850 sharedp->rsmsi_mapinfo = NULL;
4852 err = adapter->rsmpi_ops->
4853 rsm_disconnect(sharedp->rsmsi_handle);
4855 DBG_PRINTF((category, RSM_DEBUG,
4856 "rsmsegshare_resume:disconn seg=%x:err=%d\n",
4857 sharedp->rsmsi_segid, err));
4859 sharedp->rsmsi_handle = NULL;
4860 sharedp->rsmsi_state = RSMSI_STATE_DISCONNECTED;
4862 /* signal the waiting segments */
4863 cv_broadcast(&sharedp->rsmsi_cv);
4864 DBG_PRINTF((category, RSM_DEBUG,
4865 "rsmsegshare_resume done: rsmpi map err\n"));
4866 return (e);
4870 sharedp->rsmsi_state = RSMSI_STATE_MAPPED;
4872 /* signal any waiting segment */
4873 cv_broadcast(&sharedp->rsmsi_cv);
4875 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmsegshare_resume done\n"));
4877 return (e);
4881 * this is the routine that gets called by recv_taskq which is the
4882 * thread that processes messages that are flow-controlled.
4884 static void
4885 rsm_intr_proc_deferred(void *arg)
4887 path_t *path = (path_t *)arg;
4888 rsmipc_request_t *msg;
4889 rsmipc_msghdr_t *msghdr;
4890 rsm_node_id_t src_node;
4891 msgbuf_elem_t *head;
4892 int e;
4893 DBG_DEFINE(category,
4894 RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_INTR_CALLBACK);
4896 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
4897 "rsm_intr_proc_deferred enter\n"));
4899 mutex_enter(&path->mutex);
4901 /* use the head of the msgbuf_queue */
4902 head = rsmka_gethead_msgbuf(path);
4904 mutex_exit(&path->mutex);
4906 msg = (rsmipc_request_t *)&(head->msg);
4907 msghdr = (rsmipc_msghdr_t *)msg;
4909 src_node = msghdr->rsmipc_src;
4912 * messages that need to send a reply should check the message version
4913 * before processing the message. And all messages that need to
4914 * send a reply should be processed here by the worker thread.
4916 switch (msghdr->rsmipc_type) {
4917 case RSMIPC_MSG_SEGCONNECT:
4918 if (msghdr->rsmipc_version != RSM_VERSION) {
4919 rsmipc_reply_t reply;
4920 reply.rsmipc_status = RSMERR_BAD_DRIVER_VERSION;
4921 reply.rsmipc_hdr.rsmipc_type = RSMIPC_MSG_REPLY;
4922 reply.rsmipc_hdr.rsmipc_cookie = msghdr->rsmipc_cookie;
4923 (void) rsmipc_send(msghdr->rsmipc_src, NULL, &reply);
4924 } else {
4925 rsm_intr_segconnect(src_node, msg);
4927 break;
4928 case RSMIPC_MSG_DISCONNECT:
4929 rsm_force_unload(src_node, msg->rsmipc_key, DISCONNECT);
4930 break;
4931 case RSMIPC_MSG_SUSPEND:
4932 importer_suspend(src_node);
4933 break;
4934 case RSMIPC_MSG_SUSPEND_DONE:
4935 rsm_suspend_complete(src_node, 0);
4936 break;
4937 case RSMIPC_MSG_RESUME:
4938 importer_resume(src_node);
4939 break;
4940 default:
4941 ASSERT(0);
4944 mutex_enter(&path->mutex);
4946 rsmka_dequeue_msgbuf(path);
4948 /* incr procmsg_cnt can be at most RSMIPC_MAX_MESSAGES */
4949 if (path->procmsg_cnt < RSMIPC_MAX_MESSAGES)
4950 path->procmsg_cnt++;
4952 ASSERT(path->procmsg_cnt <= RSMIPC_MAX_MESSAGES);
4954 /* No need to send credits if path is going down */
4955 if ((path->state == RSMKA_PATH_ACTIVE) &&
4956 (path->procmsg_cnt >= RSMIPC_LOTSFREE_MSGBUFS)) {
4958 * send credits and reset procmsg_cnt if success otherwise
4959 * credits will be sent after processing the next message
4961 e = rsmipc_send_controlmsg(path, RSMIPC_MSG_CREDIT);
4962 if (e == 0)
4963 path->procmsg_cnt = 0;
4964 else
4965 DBG_PRINTF((category, RSM_ERR,
4966 "rsm_intr_proc_deferred:send credits err=%d\n", e));
4970 * decrement the path refcnt since we incremented it in
4971 * rsm_intr_callback_dispatch
4973 PATH_RELE_NOLOCK(path);
4975 mutex_exit(&path->mutex);
4977 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
4978 "rsm_intr_proc_deferred done\n"));
4982 * Flow-controlled messages are enqueued and dispatched onto a taskq here
4984 static void
4985 rsm_intr_callback_dispatch(void *data, rsm_addr_t src_hwaddr,
4986 rsm_intr_hand_arg_t arg)
4988 srv_handler_arg_t *hdlr_argp = (srv_handler_arg_t *)arg;
4989 path_t *path;
4990 rsmipc_msghdr_t *msghdr = (rsmipc_msghdr_t *)data;
4991 DBG_DEFINE(category,
4992 RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_INTR_CALLBACK);
4994 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
4995 "rsm_intr_callback_dispatch enter\n"));
4996 ASSERT(data && hdlr_argp);
4998 /* look up the path - incr the path refcnt */
4999 path = rsm_find_path(hdlr_argp->adapter_name,
5000 hdlr_argp->adapter_instance, src_hwaddr);
5002 /* the path has been removed - drop this message */
5003 if (path == NULL) {
5004 DBG_PRINTF((category, RSM_DEBUG,
5005 "rsm_intr_callback_dispatch done: msg dropped\n"));
5006 return;
5008 /* the path is not active - don't accept new messages */
5009 if (path->state != RSMKA_PATH_ACTIVE) {
5010 PATH_RELE_NOLOCK(path);
5011 mutex_exit(&path->mutex);
5012 DBG_PRINTF((category, RSM_DEBUG,
5013 "rsm_intr_callback_dispatch done: msg dropped"
5014 " path=%lx !ACTIVE\n", path));
5015 return;
5019 * Check if this message was sent to an older incarnation
5020 * of the path/sendq.
5022 if (path->local_incn != msghdr->rsmipc_incn) {
5023 /* decrement the refcnt */
5024 PATH_RELE_NOLOCK(path);
5025 mutex_exit(&path->mutex);
5026 DBG_PRINTF((category, RSM_DEBUG,
5027 "rsm_intr_callback_dispatch done: old incn %lld\n",
5028 msghdr->rsmipc_incn));
5029 return;
5032 /* copy and enqueue msg on the path's msgbuf queue */
5033 rsmka_enqueue_msgbuf(path, data);
5036 * schedule task to process messages - ignore retval from
5037 * task_dispatch because we sender cannot send more than
5038 * what receiver can handle.
5040 (void) taskq_dispatch(path->recv_taskq,
5041 rsm_intr_proc_deferred, path, KM_NOSLEEP);
5043 mutex_exit(&path->mutex);
5045 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
5046 "rsm_intr_callback_dispatch done\n"));
5050 * This procedure is called from rsm_srv_func when a remote node creates a
5051 * a send queue. This event is used as a hint that an earlier failed
5052 * attempt to create a send queue to that remote node may now succeed and
5053 * should be retried. Indication of an earlier failed attempt is provided
5054 * by the RSMKA_SQCREATE_PENDING flag.
5056 static void
5057 rsm_sqcreateop_callback(rsm_addr_t src_hwaddr, rsm_intr_hand_arg_t arg)
5059 srv_handler_arg_t *hdlr_argp = (srv_handler_arg_t *)arg;
5060 path_t *path;
5061 DBG_DEFINE(category,
5062 RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_INTR_CALLBACK);
5064 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
5065 "rsm_sqcreateop_callback enter\n"));
5067 /* look up the path - incr the path refcnt */
5068 path = rsm_find_path(hdlr_argp->adapter_name,
5069 hdlr_argp->adapter_instance, src_hwaddr);
5071 if (path == NULL) {
5072 DBG_PRINTF((category, RSM_DEBUG,
5073 "rsm_sqcreateop_callback done: no path\n"));
5074 return;
5077 if ((path->state == RSMKA_PATH_UP) &&
5078 (path->flags & RSMKA_SQCREATE_PENDING)) {
5080 * previous attempt to create sendq had failed, retry
5081 * it and move to RSMKA_PATH_ACTIVE state if successful.
5082 * the refcnt will be decremented in the do_deferred_work
5084 (void) rsmka_do_path_active(path, RSMKA_NO_SLEEP);
5085 } else {
5086 /* decrement the refcnt */
5087 PATH_RELE_NOLOCK(path);
5089 mutex_exit(&path->mutex);
5091 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
5092 "rsm_sqcreateop_callback done\n"));
5095 static void
5096 rsm_intr_callback(void *data, rsm_addr_t src_hwaddr, rsm_intr_hand_arg_t arg)
5098 rsmipc_msghdr_t *msghdr = (rsmipc_msghdr_t *)data;
5099 rsmipc_request_t *msg = (rsmipc_request_t *)data;
5100 rsmipc_controlmsg_t *ctrlmsg = (rsmipc_controlmsg_t *)data;
5101 rsm_node_id_t src_node;
5102 DBG_DEFINE(category,
5103 RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_INTR_CALLBACK);
5105 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_intr_callback enter:"
5106 "src=%d, type=%d\n", msghdr->rsmipc_src,
5107 msghdr->rsmipc_type));
5110 * Check for the version number in the msg header. If it is not
5111 * RSM_VERSION, drop the message. In the future, we need to manage
5112 * incompatible version numbers in some way
5114 if (msghdr->rsmipc_version != RSM_VERSION) {
5115 DBG_PRINTF((category, RSM_ERR, "wrong KA version\n"));
5117 * Drop requests that don't have a reply right here
5118 * Request with reply will send a BAD_VERSION reply
5119 * when they get processed by the worker thread.
5121 if (msghdr->rsmipc_type != RSMIPC_MSG_SEGCONNECT) {
5122 return;
5127 src_node = msghdr->rsmipc_src;
5129 switch (msghdr->rsmipc_type) {
5130 case RSMIPC_MSG_SEGCONNECT:
5131 case RSMIPC_MSG_DISCONNECT:
5132 case RSMIPC_MSG_SUSPEND:
5133 case RSMIPC_MSG_SUSPEND_DONE:
5134 case RSMIPC_MSG_RESUME:
5136 * These message types are handled by a worker thread using
5137 * the flow-control algorithm.
5138 * Any message processing that does one or more of the
5139 * following should be handled in a worker thread.
5140 * - allocates resources and might sleep
5141 * - makes RSMPI calls down to the interconnect driver
5142 * this by defn include requests with reply.
5143 * - takes a long duration of time
5145 rsm_intr_callback_dispatch(data, src_hwaddr, arg);
5146 break;
5147 case RSMIPC_MSG_NOTIMPORTING:
5148 importer_list_rm(src_node, msg->rsmipc_key,
5149 msg->rsmipc_segment_cookie);
5150 break;
5151 case RSMIPC_MSG_SQREADY:
5152 rsm_proc_sqready(data, src_hwaddr, arg);
5153 break;
5154 case RSMIPC_MSG_SQREADY_ACK:
5155 rsm_proc_sqready_ack(data, src_hwaddr, arg);
5156 break;
5157 case RSMIPC_MSG_CREDIT:
5158 rsm_add_credits(ctrlmsg, src_hwaddr, arg);
5159 break;
5160 case RSMIPC_MSG_REPLY:
5161 rsm_intr_reply(msghdr);
5162 break;
5163 case RSMIPC_MSG_BELL:
5164 rsm_intr_event(msg);
5165 break;
5166 case RSMIPC_MSG_IMPORTING:
5167 importer_list_add(src_node, msg->rsmipc_key,
5168 msg->rsmipc_adapter_hwaddr,
5169 msg->rsmipc_segment_cookie);
5170 break;
5171 case RSMIPC_MSG_REPUBLISH:
5172 importer_update(src_node, msg->rsmipc_key, msg->rsmipc_perm);
5173 break;
5174 default:
5175 DBG_PRINTF((category, RSM_DEBUG,
5176 "rsm_intr_callback: bad msg %lx type %d data %lx\n",
5177 (size_t)msg, (int)(msghdr->rsmipc_type), (size_t)data));
5180 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_intr_callback done\n"));
5184 rsm_intr_hand_ret_t rsm_srv_func(rsm_controller_object_t *chd,
5185 rsm_intr_q_op_t opcode, rsm_addr_t src,
5186 void *data, size_t size, rsm_intr_hand_arg_t arg)
5188 DBG_DEFINE(category,
5189 RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_INTR_CALLBACK);
5191 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_srv_func enter\n"));
5193 switch (opcode) {
5194 case RSM_INTR_Q_OP_CREATE:
5195 DBG_PRINTF((category, RSM_DEBUG, "rsm_srv_func:OP_CREATE\n"));
5196 rsm_sqcreateop_callback(src, arg);
5197 break;
5198 case RSM_INTR_Q_OP_DESTROY:
5199 DBG_PRINTF((category, RSM_DEBUG, "rsm_srv_func:OP_DESTROY\n"));
5200 break;
5201 case RSM_INTR_Q_OP_RECEIVE:
5202 rsm_intr_callback(data, src, arg);
5203 break;
5204 default:
5205 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
5206 "rsm_srv_func: unknown opcode = %x\n", opcode));
5209 chd = chd;
5210 size = size;
5212 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_srv_func done\n"));
5214 return (RSM_INTR_HAND_CLAIMED);
5217 /* *************************** IPC slots ************************* */
5218 static rsmipc_slot_t *
5219 rsmipc_alloc()
5221 int i;
5222 rsmipc_slot_t *slot;
5223 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL);
5225 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmipc_alloc enter\n"));
5227 /* try to find a free slot, if not wait */
5228 mutex_enter(&rsm_ipc.lock);
5230 while (rsm_ipc.count == 0) {
5231 rsm_ipc.wanted = 1;
5232 cv_wait(&rsm_ipc.cv, &rsm_ipc.lock);
5235 /* An empty slot is available, find it */
5236 slot = &rsm_ipc.slots[0];
5237 for (i = 0; i < RSMIPC_SZ; i++, slot++) {
5238 if (RSMIPC_GET(slot, RSMIPC_FREE)) {
5239 RSMIPC_CLEAR(slot, RSMIPC_FREE);
5240 break;
5244 ASSERT(i < RSMIPC_SZ);
5245 rsm_ipc.count--; /* one less is available */
5246 rsm_ipc.sequence++; /* new sequence */
5248 slot->rsmipc_cookie.ic.sequence = (uint_t)rsm_ipc.sequence;
5249 slot->rsmipc_cookie.ic.index = (uint_t)i;
5251 mutex_exit(&rsm_ipc.lock);
5253 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmipc_alloc done\n"));
5255 return (slot);
5258 static void
5259 rsmipc_free(rsmipc_slot_t *slot)
5261 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL);
5263 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmipc_free enter\n"));
5265 ASSERT(MUTEX_HELD(&slot->rsmipc_lock));
5266 ASSERT(&rsm_ipc.slots[slot->rsmipc_cookie.ic.index] == slot);
5268 mutex_enter(&rsm_ipc.lock);
5270 RSMIPC_SET(slot, RSMIPC_FREE);
5272 slot->rsmipc_cookie.ic.sequence = 0;
5274 mutex_exit(&slot->rsmipc_lock);
5275 rsm_ipc.count++;
5276 ASSERT(rsm_ipc.count <= RSMIPC_SZ);
5277 if (rsm_ipc.wanted) {
5278 rsm_ipc.wanted = 0;
5279 cv_broadcast(&rsm_ipc.cv);
5282 mutex_exit(&rsm_ipc.lock);
5284 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmipc_free done\n"));
/*
 * rsmipc_send - deliver a kernel agent IPC message to node dest.
 *
 * Which of req and reply is NULL selects the mode of operation:
 *   reply == NULL  req is sent and no reply is waited for.
 *   req == NULL    reply is sent; no flow control is done for replies.
 *   neither NULL   req is sent with the cookie of a freshly allocated IPC
 *                  slot and the caller waits on that slot, in 5 second
 *                  steps, for the reply to be delivered into *reply.
 *
 * When dest is the local node nothing is transmitted; the handler for the
 * request type is invoked directly and 0 is returned.
 *
 * Selected request types (SEGCONNECT, DISCONNECT, IMPORTING, SUSPEND,
 * SUSPEND_DONE, RESUME) are flow controlled: a msgbuf credit on the sendq
 * token must be reserved before sending, and is given back if the send
 * fails.
 *
 * Failed sends jump back to the "again" label to pick a sendq token anew,
 * backing off for 10ms after every min_retry_cnt attempts.
 *
 * Returns the status of the send, RSMERR_REMOTE_NODE_UNREACHABLE when dest
 * is out of range or no sendq token can be obtained, RSMERR_CONN_ABORTED
 * when the same sendq token is handed out again after a connection type
 * error, and RSMERR_INTERRUPTED when a wait is interrupted by a signal.
 *
 * NOTE(review): the local-node branch dereferences req unconditionally, so
 * it appears to assume req != NULL whenever dest == my_nodeid - confirm
 * against the callers.
 */
5287 static int
5288 rsmipc_send(rsm_node_id_t dest, rsmipc_request_t *req, rsmipc_reply_t *reply)
5290 int e = 0;
5291 int credit_check = 0;
5292 int retry_cnt = 0;
5293 int min_retry_cnt = 10;
5294 rsm_send_t is;
5295 rsmipc_slot_t *rslot;
5296 adapter_t *adapter;
5297 path_t *path;
5298 sendq_token_t *sendq_token;
5299 sendq_token_t *used_sendq_token = NULL;
5300 rsm_send_q_handle_t ipc_handle;
5301 DBG_DEFINE(category,
5302 RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_INTR_CALLBACK);
5304 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmipc_send enter:dest=%d",
5305 dest));
5308 * Check if this is a local case
5310 if (dest == my_nodeid) {
/* loopback: call the receive-side handler for the request directly */
5311 switch (req->rsmipc_hdr.rsmipc_type) {
5312 case RSMIPC_MSG_SEGCONNECT:
5313 reply->rsmipc_status = (short)rsmsegacl_validate(
5314 req, dest, reply);
5315 break;
5316 case RSMIPC_MSG_BELL:
5317 req->rsmipc_hdr.rsmipc_src = dest;
5318 rsm_intr_event(req);
5319 break;
5320 case RSMIPC_MSG_IMPORTING:
5321 importer_list_add(dest, req->rsmipc_key,
5322 req->rsmipc_adapter_hwaddr,
5323 req->rsmipc_segment_cookie);
5324 break;
5325 case RSMIPC_MSG_NOTIMPORTING:
5326 importer_list_rm(dest, req->rsmipc_key,
5327 req->rsmipc_segment_cookie);
5328 break;
5329 case RSMIPC_MSG_REPUBLISH:
5330 importer_update(dest, req->rsmipc_key,
5331 req->rsmipc_perm);
5332 break;
5333 case RSMIPC_MSG_SUSPEND:
5334 importer_suspend(dest);
5335 break;
5336 case RSMIPC_MSG_SUSPEND_DONE:
5337 rsm_suspend_complete(dest, 0);
5338 break;
5339 case RSMIPC_MSG_RESUME:
5340 importer_resume(dest);
5341 break;
5342 default:
5343 ASSERT(0);
5345 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
5346 "rsmipc_send done\n"));
5347 return (0);
5350 if (dest >= MAX_NODES) {
5351 DBG_PRINTF((category, RSM_ERR,
5352 "rsm: rsmipc_send bad node number %x\n", dest));
5353 return (RSMERR_REMOTE_NODE_UNREACHABLE);
5357 * Oh boy! we are going remote.
5361 * identify if we need to have credits to send this message
5362 * - only selected requests are flow controlled
5364 if (req != NULL) {
5365 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
5366 "rsmipc_send:request type=%d\n",
5367 req->rsmipc_hdr.rsmipc_type));
5369 switch (req->rsmipc_hdr.rsmipc_type) {
5370 case RSMIPC_MSG_SEGCONNECT:
5371 case RSMIPC_MSG_DISCONNECT:
5372 case RSMIPC_MSG_IMPORTING:
5373 case RSMIPC_MSG_SUSPEND:
5374 case RSMIPC_MSG_SUSPEND_DONE:
5375 case RSMIPC_MSG_RESUME:
5376 credit_check = 1;
5377 break;
5378 default:
5379 credit_check = 0;
/* retry entry point: every failed send path below jumps back to here */
5383 again:
5384 if (retry_cnt++ == min_retry_cnt) {
5385 /* backoff before further retries for 10ms */
5386 delay(drv_usectohz(10000));
5387 retry_cnt = 0; /* reset retry_cnt */
5389 sendq_token = rsmka_get_sendq_token(dest, used_sendq_token);
5390 if (sendq_token == NULL) {
5391 DBG_PRINTF((category, RSM_ERR,
5392 "rsm: rsmipc_send no device to reach node %d\n", dest));
5393 return (RSMERR_REMOTE_NODE_UNREACHABLE);
/*
 * Being handed the token that was just used, after an abort, timeout or
 * maybe-delivered error, ends the retries (presumably no other path is
 * available - confirm against rsmka_get_sendq_token).
 */
5396 if ((sendq_token == used_sendq_token) &&
5397 ((e == RSMERR_CONN_ABORTED) || (e == RSMERR_TIMEOUT) ||
5398 (e == RSMERR_COMM_ERR_MAYBE_DELIVERED))) {
5399 rele_sendq_token(sendq_token);
5400 DBG_PRINTF((category, RSM_DEBUG, "rsmipc_send done=%d\n", e));
5401 return (RSMERR_CONN_ABORTED);
5402 } else
5403 used_sendq_token = sendq_token;
5405 /* lint -save -e413 */
5406 path = SQ_TOKEN_TO_PATH(sendq_token);
5407 adapter = path->local_adapter;
5408 /* lint -restore */
5409 ipc_handle = sendq_token->rsmpi_sendq_handle;
5411 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
5412 "rsmipc_send: path=%lx sendq_hdl=%lx\n", path, ipc_handle));
/* mode 1: request only, nobody waits for a reply */
5414 if (reply == NULL) {
5415 /* Send request without ack */
5417 * Set the rsmipc_version number in the msghdr for KA
5418 * communication versioning
5420 req->rsmipc_hdr.rsmipc_version = RSM_VERSION;
5421 req->rsmipc_hdr.rsmipc_src = my_nodeid;
5423 * remote endpoints incn should match the value in our
5424 * path's remote_incn field. No need to grab any lock
5425 * since we have refcnted the path in rsmka_get_sendq_token
5427 req->rsmipc_hdr.rsmipc_incn = path->remote_incn;
5429 is.is_data = (void *)req;
5430 is.is_size = sizeof (*req);
5431 is.is_flags = RSM_INTR_SEND_DELIVER | RSM_INTR_SEND_SLEEP;
5432 is.is_wait = 0;
5434 if (credit_check) {
5435 mutex_enter(&path->mutex);
5437 * wait till we recv credits or path goes down. If path
5438 * goes down rsm_send will fail and we handle the error
5439 * then
5441 while ((sendq_token->msgbuf_avail == 0) &&
5442 (path->state == RSMKA_PATH_ACTIVE)) {
5443 e = cv_wait_sig(&sendq_token->sendq_cv,
5444 &path->mutex);
/* a zero return from cv_wait_sig is treated as an interrupted wait */
5445 if (e == 0) {
5446 mutex_exit(&path->mutex);
5447 no_reply_cnt++;
5448 rele_sendq_token(sendq_token);
5449 DBG_PRINTF((category, RSM_DEBUG,
5450 "rsmipc_send done: "
5451 "cv_wait INTERRUPTED"));
5452 return (RSMERR_INTERRUPTED);
5457 * path is not active retry on another path.
5459 if (path->state != RSMKA_PATH_ACTIVE) {
5460 mutex_exit(&path->mutex);
5461 rele_sendq_token(sendq_token);
5462 e = RSMERR_CONN_ABORTED;
5463 DBG_PRINTF((category, RSM_ERR,
5464 "rsm: rsmipc_send: path !ACTIVE"));
5465 goto again;
5468 ASSERT(sendq_token->msgbuf_avail > 0);
5471 * reserve a msgbuf
5473 sendq_token->msgbuf_avail--;
5475 mutex_exit(&path->mutex);
5477 e = adapter->rsmpi_ops->rsm_send(ipc_handle, &is,
5478 NULL);
5480 if (e != RSM_SUCCESS) {
5481 mutex_enter(&path->mutex);
5483 * release the reserved msgbuf since
5484 * the send failed
5486 sendq_token->msgbuf_avail++;
5487 cv_broadcast(&sendq_token->sendq_cv);
5488 mutex_exit(&path->mutex);
5490 } else
5491 e = adapter->rsmpi_ops->rsm_send(ipc_handle, &is,
5492 NULL);
5494 no_reply_cnt++;
5495 rele_sendq_token(sendq_token);
5496 if (e != RSM_SUCCESS) {
5497 DBG_PRINTF((category, RSM_ERR,
5498 "rsm: rsmipc_send no reply send"
5499 " err = %d no reply count = %d\n",
5500 e, no_reply_cnt));
5501 ASSERT(e != RSMERR_QUEUE_FENCE_UP &&
5502 e != RSMERR_BAD_BARRIER_HNDL);
5503 atomic_inc_64(&rsm_ipcsend_errcnt);
5504 goto again;
5505 } else {
5506 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
5507 "rsmipc_send done\n"));
5508 return (e);
/* mode 2: send a reply message */
5513 if (req == NULL) {
5514 /* Send reply - No flow control is done for reply */
5516 * Set the version in the msg header for KA communication
5517 * versioning
5519 reply->rsmipc_hdr.rsmipc_version = RSM_VERSION;
5520 reply->rsmipc_hdr.rsmipc_src = my_nodeid;
5521 /* incn number is not used for reply msgs currently */
5522 reply->rsmipc_hdr.rsmipc_incn = path->remote_incn;
5524 is.is_data = (void *)reply;
5525 is.is_size = sizeof (*reply);
5526 is.is_flags = RSM_INTR_SEND_DELIVER | RSM_INTR_SEND_SLEEP;
5527 is.is_wait = 0;
5528 e = adapter->rsmpi_ops->rsm_send(ipc_handle, &is, NULL);
5529 rele_sendq_token(sendq_token);
5530 if (e != RSM_SUCCESS) {
5531 DBG_PRINTF((category, RSM_ERR,
5532 "rsm: rsmipc_send reply send"
5533 " err = %d\n", e));
5534 atomic_inc_64(&rsm_ipcsend_errcnt);
5535 goto again;
5536 } else {
5537 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
5538 "rsmipc_send done\n"));
5539 return (e);
/* mode 3: send the request and wait on an IPC slot for the reply */
5543 /* Reply needed */
5544 rslot = rsmipc_alloc(); /* allocate a new ipc slot */
5546 mutex_enter(&rslot->rsmipc_lock);
5548 rslot->rsmipc_data = (void *)reply;
5549 RSMIPC_SET(rslot, RSMIPC_PENDING);
/* the request is sent again on each pass while the slot is still pending */
5551 while (RSMIPC_GET(rslot, RSMIPC_PENDING)) {
5553 * Set the rsmipc_version number in the msghdr for KA
5554 * communication versioning
5556 req->rsmipc_hdr.rsmipc_version = RSM_VERSION;
5557 req->rsmipc_hdr.rsmipc_src = my_nodeid;
5558 req->rsmipc_hdr.rsmipc_cookie = rslot->rsmipc_cookie;
5560 * remote endpoints incn should match the value in our
5561 * path's remote_incn field. No need to grab any lock
5562 * since we have refcnted the path in rsmka_get_sendq_token
5564 req->rsmipc_hdr.rsmipc_incn = path->remote_incn;
5566 is.is_data = (void *)req;
5567 is.is_size = sizeof (*req);
5568 is.is_flags = RSM_INTR_SEND_DELIVER | RSM_INTR_SEND_SLEEP;
5569 is.is_wait = 0;
5570 if (credit_check) {
5572 mutex_enter(&path->mutex);
5574 * wait till we recv credits or path goes down. If path
5575 * goes down rsm_send will fail and we handle the error
5576 * then.
5578 while ((sendq_token->msgbuf_avail == 0) &&
5579 (path->state == RSMKA_PATH_ACTIVE)) {
5580 e = cv_wait_sig(&sendq_token->sendq_cv,
5581 &path->mutex);
5582 if (e == 0) {
5583 mutex_exit(&path->mutex);
5584 RSMIPC_CLEAR(rslot, RSMIPC_PENDING);
/* rsmipc_free() also drops rslot->rsmipc_lock taken above */
5585 rsmipc_free(rslot);
5586 rele_sendq_token(sendq_token);
5587 DBG_PRINTF((category, RSM_DEBUG,
5588 "rsmipc_send done: "
5589 "cv_wait INTERRUPTED"));
5590 return (RSMERR_INTERRUPTED);
5595 * path is not active retry on another path.
5597 if (path->state != RSMKA_PATH_ACTIVE) {
5598 mutex_exit(&path->mutex);
5599 RSMIPC_CLEAR(rslot, RSMIPC_PENDING);
5600 rsmipc_free(rslot);
5601 rele_sendq_token(sendq_token);
5602 e = RSMERR_CONN_ABORTED;
5603 DBG_PRINTF((category, RSM_ERR,
5604 "rsm: rsmipc_send: path !ACTIVE"));
5605 goto again;
5608 ASSERT(sendq_token->msgbuf_avail > 0);
5611 * reserve a msgbuf
5613 sendq_token->msgbuf_avail--;
5615 mutex_exit(&path->mutex);
5617 e = adapter->rsmpi_ops->rsm_send(ipc_handle, &is,
5618 NULL);
5620 if (e != RSM_SUCCESS) {
5621 mutex_enter(&path->mutex);
5623 * release the reserved msgbuf since
5624 * the send failed
5626 sendq_token->msgbuf_avail++;
5627 cv_broadcast(&sendq_token->sendq_cv);
5628 mutex_exit(&path->mutex);
5630 } else
5631 e = adapter->rsmpi_ops->rsm_send(ipc_handle, &is,
5632 NULL);
5634 if (e != RSM_SUCCESS) {
5635 DBG_PRINTF((category, RSM_ERR,
5636 "rsm: rsmipc_send rsmpi send err = %d\n", e));
5637 RSMIPC_CLEAR(rslot, RSMIPC_PENDING);
5638 rsmipc_free(rslot);
5639 rele_sendq_token(sendq_token);
5640 atomic_inc_64(&rsm_ipcsend_errcnt);
5641 goto again;
5644 /* wait for a reply signal, a SIGINT, or 5 sec. timeout */
5645 e = cv_reltimedwait_sig(&rslot->rsmipc_cv, &rslot->rsmipc_lock,
5646 drv_usectohz(5000000), TR_CLOCK_TICK);
5647 if (e < 0) {
5648 /* timed out - retry */
5649 e = RSMERR_TIMEOUT;
5650 } else if (e == 0) {
5651 /* signalled - return error */
5652 e = RSMERR_INTERRUPTED;
5653 break;
5654 } else {
5655 e = RSM_SUCCESS;
/* common exit of the reply-wait mode: give back the slot and the token */
5659 RSMIPC_CLEAR(rslot, RSMIPC_PENDING);
5660 rsmipc_free(rslot);
5661 rele_sendq_token(sendq_token);
5663 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmipc_send done=%d\n", e));
5664 return (e);
5667 static int
5668 rsm_send_notimporting(rsm_node_id_t dest, rsm_memseg_id_t segid, void *cookie)
5670 rsmipc_request_t request;
5673 * inform the exporter to delete this importer
5675 request.rsmipc_hdr.rsmipc_type = RSMIPC_MSG_NOTIMPORTING;
5676 request.rsmipc_key = segid;
5677 request.rsmipc_segment_cookie = cookie;
5678 return (rsmipc_send(dest, &request, RSM_NO_REPLY));
5681 static void
5682 rsm_send_republish(rsm_memseg_id_t segid, rsmapi_access_entry_t *acl,
5683 int acl_len, rsm_permission_t default_permission)
5685 int i;
5686 importing_token_t *token;
5687 rsmipc_request_t request;
5688 republish_token_t *republish_list = NULL;
5689 republish_token_t *rp;
5690 rsm_permission_t permission;
5691 int index;
5694 * send the new access mode to all the nodes that have imported
5695 * this segment.
5696 * If the new acl does not have a node that was present in
5697 * the old acl a access permission of 0 is sent.
5700 index = rsmhash(segid);
5703 * create a list of node/permissions to send the republish message
5705 mutex_enter(&importer_list.lock);
5707 token = importer_list.bucket[index];
5708 while (token != NULL) {
5709 if (segid == token->key) {
5710 permission = default_permission;
5712 for (i = 0; i < acl_len; i++) {
5713 if (token->importing_node == acl[i].ae_node) {
5714 permission = acl[i].ae_permission;
5715 break;
5718 rp = kmem_zalloc(sizeof (republish_token_t), KM_SLEEP);
5720 rp->key = segid;
5721 rp->importing_node = token->importing_node;
5722 rp->permission = permission;
5723 rp->next = republish_list;
5724 republish_list = rp;
5726 token = token->next;
5729 mutex_exit(&importer_list.lock);
5731 request.rsmipc_hdr.rsmipc_type = RSMIPC_MSG_REPUBLISH;
5732 request.rsmipc_key = segid;
5734 while (republish_list != NULL) {
5735 request.rsmipc_perm = republish_list->permission;
5736 (void) rsmipc_send(republish_list->importing_node,
5737 &request, RSM_NO_REPLY);
5738 rp = republish_list;
5739 republish_list = republish_list->next;
5740 kmem_free(rp, sizeof (republish_token_t));
5744 static void
5745 rsm_send_suspend()
5747 int i, e;
5748 rsmipc_request_t request;
5749 list_element_t *tokp;
5750 list_element_t *head = NULL;
5751 importing_token_t *token;
5752 DBG_PRINTF((RSM_KERNEL_AGENT | RSM_EXPORT, RSM_DEBUG_VERBOSE,
5753 "rsm_send_suspend enter\n"));
5756 * create a list of node to send the suspend message
5758 * Currently the whole importer list is scanned and we obtain
5759 * all the nodes - this basically gets all nodes that at least
5760 * import one segment from the local node.
5762 * no need to grab the rsm_suspend_list lock here since we are
5763 * single threaded when suspend is called.
5766 mutex_enter(&importer_list.lock);
5767 for (i = 0; i < rsm_hash_size; i++) {
5769 token = importer_list.bucket[i];
5771 while (token != NULL) {
5773 tokp = head;
5776 * make sure that the token's node
5777 * is not already on the suspend list
5779 while (tokp != NULL) {
5780 if (tokp->nodeid == token->importing_node) {
5781 break;
5783 tokp = tokp->next;
5786 if (tokp == NULL) { /* not in suspend list */
5787 tokp = kmem_zalloc(sizeof (list_element_t),
5788 KM_SLEEP);
5789 tokp->nodeid = token->importing_node;
5790 tokp->next = head;
5791 head = tokp;
5794 token = token->next;
5797 mutex_exit(&importer_list.lock);
5799 if (head == NULL) { /* no importers so go ahead and quiesce segments */
5800 exporter_quiesce();
5801 return;
5804 mutex_enter(&rsm_suspend_list.list_lock);
5805 ASSERT(rsm_suspend_list.list_head == NULL);
5807 * update the suspend list righaway so that if a node dies the
5808 * pathmanager can set the NODE dead flag
5810 rsm_suspend_list.list_head = head;
5811 mutex_exit(&rsm_suspend_list.list_lock);
5813 tokp = head;
5815 while (tokp != NULL) {
5816 request.rsmipc_hdr.rsmipc_type = RSMIPC_MSG_SUSPEND;
5817 e = rsmipc_send(tokp->nodeid, &request, RSM_NO_REPLY);
5819 * Error in rsmipc_send currently happens due to inaccessibility
5820 * of the remote node.
5822 if (e == RSM_SUCCESS) { /* send failed - don't wait for ack */
5823 tokp->flags |= RSM_SUSPEND_ACKPENDING;
5826 tokp = tokp->next;
5829 DBG_PRINTF((RSM_KERNEL_AGENT | RSM_EXPORT, RSM_DEBUG_VERBOSE,
5830 "rsm_send_suspend done\n"));
5834 static void
5835 rsm_send_resume()
5837 rsmipc_request_t request;
5838 list_element_t *elem, *head;
5841 * save the suspend list so that we know where to send
5842 * the resume messages and make the suspend list head
5843 * NULL.
5845 mutex_enter(&rsm_suspend_list.list_lock);
5846 head = rsm_suspend_list.list_head;
5847 rsm_suspend_list.list_head = NULL;
5848 mutex_exit(&rsm_suspend_list.list_lock);
5850 while (head != NULL) {
5851 elem = head;
5852 head = head->next;
5854 request.rsmipc_hdr.rsmipc_type = RSMIPC_MSG_RESUME;
5856 (void) rsmipc_send(elem->nodeid, &request, RSM_NO_REPLY);
5858 kmem_free((void *)elem, sizeof (list_element_t));
5865 * This function takes path and sends a message using the sendq
5866 * corresponding to it. The RSMIPC_MSG_SQREADY, RSMIPC_MSG_SQREADY_ACK
5867 * and RSMIPC_MSG_CREDIT are sent using this function.
5870 rsmipc_send_controlmsg(path_t *path, int msgtype)
5872 int e;
5873 int retry_cnt = 0;
5874 int min_retry_cnt = 10;
5875 adapter_t *adapter;
5876 rsm_send_t is;
5877 rsm_send_q_handle_t ipc_handle;
5878 rsmipc_controlmsg_t msg;
5879 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_FLOWCONTROL);
5881 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
5882 "rsmipc_send_controlmsg enter\n"));
5884 ASSERT(MUTEX_HELD(&path->mutex));
5886 adapter = path->local_adapter;
5888 DBG_PRINTF((category, RSM_DEBUG, "rsmipc_send_controlmsg:path=%lx "
5889 "msgtype=%d %lx:%llx->%lx:%llx procmsg=%d\n", path, msgtype,
5890 my_nodeid, adapter->hwaddr, path->remote_node,
5891 path->remote_hwaddr, path->procmsg_cnt));
5893 if (path->state != RSMKA_PATH_ACTIVE) {
5894 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
5895 "rsmipc_send_controlmsg done: ! RSMKA_PATH_ACTIVE"));
5896 return (1);
5899 ipc_handle = path->sendq_token.rsmpi_sendq_handle;
5901 msg.rsmipc_hdr.rsmipc_version = RSM_VERSION;
5902 msg.rsmipc_hdr.rsmipc_src = my_nodeid;
5903 msg.rsmipc_hdr.rsmipc_type = msgtype;
5904 msg.rsmipc_hdr.rsmipc_incn = path->remote_incn;
5906 if (msgtype == RSMIPC_MSG_CREDIT)
5907 msg.rsmipc_credits = path->procmsg_cnt;
5909 msg.rsmipc_local_incn = path->local_incn;
5911 msg.rsmipc_adapter_hwaddr = adapter->hwaddr;
5912 /* incr the sendq, path refcnt */
5913 PATH_HOLD_NOLOCK(path);
5914 SENDQ_TOKEN_HOLD(path);
5916 do {
5917 /* drop the path lock before doing the rsm_send */
5918 mutex_exit(&path->mutex);
5920 is.is_data = (void *)&msg;
5921 is.is_size = sizeof (msg);
5922 is.is_flags = RSM_INTR_SEND_DELIVER | RSM_INTR_SEND_SLEEP;
5923 is.is_wait = 0;
5925 e = adapter->rsmpi_ops->rsm_send(ipc_handle, &is, NULL);
5927 ASSERT(e != RSMERR_QUEUE_FENCE_UP &&
5928 e != RSMERR_BAD_BARRIER_HNDL);
5930 mutex_enter(&path->mutex);
5932 if (e == RSM_SUCCESS) {
5933 break;
5935 /* error counter for statistics */
5936 atomic_inc_64(&rsm_ctrlmsg_errcnt);
5938 DBG_PRINTF((category, RSM_ERR,
5939 "rsmipc_send_controlmsg:rsm_send error=%d", e));
5941 if (++retry_cnt == min_retry_cnt) { /* backoff before retry */
5942 (void) cv_reltimedwait(&path->sendq_token.sendq_cv,
5943 &path->mutex, drv_usectohz(10000), TR_CLOCK_TICK);
5944 retry_cnt = 0;
5946 } while (path->state == RSMKA_PATH_ACTIVE);
5948 /* decrement the sendq,path refcnt that we incr before rsm_send */
5949 SENDQ_TOKEN_RELE(path);
5950 PATH_RELE_NOLOCK(path);
5952 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
5953 "rsmipc_send_controlmsg done=%d", e));
5954 return (e);
5958 * Called from rsm_force_unload and path_importer_disconnect. The memory
5959 * mapping for the imported segment is removed and the segment is
5960 * disconnected at the interconnect layer if disconnect_flag is TRUE.
5961 * rsm_force_unload will get disconnect_flag TRUE from rsm_intr_callback
5962 * and FALSE from rsm_rebind.
5964 * When subsequent accesses cause page faulting, the dummy page is mapped
5965 * to resolve the fault, and the mapping generation number is incremented
5966 * so that the application can be notified on a close barrier operation.
5968 * It is important to note that the caller of rsmseg_unload is responsible for
5969 * acquiring the segment lock before making a call to rsmseg_unload. This is
5970 * required to make the caller and rsmseg_unload thread safe. The segment lock
5971 * will be released by the rsmseg_unload function.
5973 void
5974 rsmseg_unload(rsmseg_t *im_seg)
5976 rsmcookie_t *hdl;
5977 void *shared_cookie;
5978 rsmipc_request_t request;
5979 uint_t maxprot;
5981 DBG_DEFINE(category,
5982 RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_INTR_CALLBACK);
5984 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmseg_unload enter\n"));
5986 ASSERT(im_seg->s_hdr.rsmrc_type == RSM_RESOURCE_IMPORT_SEGMENT);
5988 /* wait until segment leaves the mapping state */
5989 while (im_seg->s_state == RSM_STATE_MAPPING)
5990 cv_wait(&im_seg->s_cv, &im_seg->s_lock);
5992 * An unload is only necessary if the segment is connected. However,
5993 * if the segment was on the import list in state RSM_STATE_CONNECTING
5994 * then a connection was in progress. Change to RSM_STATE_NEW
5995 * here to cause an early exit from the connection process.
5997 if (im_seg->s_state == RSM_STATE_NEW) {
5998 rsmseglock_release(im_seg);
5999 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
6000 "rsmseg_unload done: RSM_STATE_NEW\n"));
6001 return;
6002 } else if (im_seg->s_state == RSM_STATE_CONNECTING) {
6003 im_seg->s_state = RSM_STATE_ABORT_CONNECT;
6004 rsmsharelock_acquire(im_seg);
6005 im_seg->s_share->rsmsi_state = RSMSI_STATE_ABORT_CONNECT;
6006 rsmsharelock_release(im_seg);
6007 rsmseglock_release(im_seg);
6008 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
6009 "rsmseg_unload done: RSM_STATE_CONNECTING\n"));
6010 return;
6013 if (im_seg->s_flags & RSM_FORCE_DISCONNECT) {
6014 if (im_seg->s_ckl != NULL) {
6015 int e;
6016 /* Setup protections for remap */
6017 maxprot = PROT_USER;
6018 if (im_seg->s_mode & RSM_PERM_READ) {
6019 maxprot |= PROT_READ;
6021 if (im_seg->s_mode & RSM_PERM_WRITE) {
6022 maxprot |= PROT_WRITE;
6024 hdl = im_seg->s_ckl;
6025 for (; hdl != NULL; hdl = hdl->c_next) {
6026 e = devmap_umem_remap(hdl->c_dhp, rsm_dip,
6027 remap_cookie,
6028 hdl->c_off, hdl->c_len,
6029 maxprot, 0, NULL);
6031 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
6032 "remap returns %d\n", e));
6036 (void) rsm_closeconnection(im_seg, &shared_cookie);
6038 if (shared_cookie != NULL) {
6040 * inform the exporting node so this import
6041 * can be deleted from the list of importers.
6043 request.rsmipc_hdr.rsmipc_type =
6044 RSMIPC_MSG_NOTIMPORTING;
6045 request.rsmipc_key = im_seg->s_segid;
6046 request.rsmipc_segment_cookie = shared_cookie;
6047 rsmseglock_release(im_seg);
6048 (void) rsmipc_send(im_seg->s_node, &request,
6049 RSM_NO_REPLY);
6050 } else {
6051 rsmseglock_release(im_seg);
6054 else
6055 rsmseglock_release(im_seg);
6057 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmseg_unload done\n"));
6061 /* ****************************** Importer Calls ************************ */
6063 static int
6064 rsm_access(uid_t owner, gid_t group, int perm, int mode, const struct cred *cr)
6066 int shifts = 0;
6068 if (crgetuid(cr) != owner) {
6069 shifts += 3;
6070 if (!groupmember(group, cr))
6071 shifts += 3;
6074 mode &= ~(perm << shifts);
6076 if (mode == 0)
6077 return (0);
6079 return (secpolicy_rsm_access(cr, owner, mode));
6083 static int
6084 rsm_connect(rsmseg_t *seg, rsm_ioctlmsg_t *msg, cred_t *cred,
6085 intptr_t dataptr, int mode)
6087 int e;
6088 int recheck_state = 0;
6089 void *shared_cookie;
6090 rsmipc_request_t request;
6091 rsmipc_reply_t reply;
6092 rsm_permission_t access;
6093 adapter_t *adapter;
6094 rsm_addr_t addr = 0;
6095 rsm_import_share_t *sharedp;
6096 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_IMPORT);
6098 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_connect enter\n"));
6100 adapter = rsm_getadapter(msg, mode);
6101 if (adapter == NULL) {
6102 DBG_PRINTF((category, RSM_ERR,
6103 "rsm_connect done:ENODEV adapter=NULL\n"));
6104 return (RSMERR_CTLR_NOT_PRESENT);
6107 if ((adapter == &loopback_adapter) && (msg->nodeid != my_nodeid)) {
6108 rsmka_release_adapter(adapter);
6109 DBG_PRINTF((category, RSM_ERR,
6110 "rsm_connect done:ENODEV loopback\n"));
6111 return (RSMERR_CTLR_NOT_PRESENT);
6115 ASSERT(seg->s_hdr.rsmrc_type == RSM_RESOURCE_IMPORT_SEGMENT);
6116 ASSERT(seg->s_state == RSM_STATE_NEW);
6119 * Translate perm to access
6121 if (msg->perm & ~RSM_PERM_RDWR) {
6122 rsmka_release_adapter(adapter);
6123 DBG_PRINTF((category, RSM_ERR,
6124 "rsm_connect done:EINVAL invalid perms\n"));
6125 return (RSMERR_BAD_PERMS);
6127 access = 0;
6128 if (msg->perm & RSM_PERM_READ)
6129 access |= RSM_ACCESS_READ;
6130 if (msg->perm & RSM_PERM_WRITE)
6131 access |= RSM_ACCESS_WRITE;
6133 seg->s_node = msg->nodeid;
6136 * Adding to the import list locks the segment; release the segment
6137 * lock so we can get the reply for the send.
6139 e = rsmimport_add(seg, msg->key);
6140 if (e) {
6141 rsmka_release_adapter(adapter);
6142 DBG_PRINTF((category, RSM_ERR,
6143 "rsm_connect done:rsmimport_add failed %d\n", e));
6144 return (e);
6146 seg->s_state = RSM_STATE_CONNECTING;
6149 * Set the s_adapter field here so as to have a valid comparison of
6150 * the adapter and the s_adapter value during rsmshare_get. For
6151 * any error, set s_adapter to NULL before doing a release_adapter
6153 seg->s_adapter = adapter;
6155 rsmseglock_release(seg);
6158 * get the pointer to the shared data structure; the
6159 * shared data is locked and refcount has been incremented
6161 sharedp = rsmshare_get(msg->key, msg->nodeid, adapter, seg);
6163 ASSERT(rsmsharelock_held(seg));
6165 do {
6166 /* flag indicates whether we need to recheck the state */
6167 recheck_state = 0;
6168 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
6169 "rsm_connect:RSMSI_STATE=%d\n", sharedp->rsmsi_state));
6170 switch (sharedp->rsmsi_state) {
6171 case RSMSI_STATE_NEW:
6172 sharedp->rsmsi_state = RSMSI_STATE_CONNECTING;
6173 break;
6174 case RSMSI_STATE_CONNECTING:
6175 /* FALLTHRU */
6176 case RSMSI_STATE_CONN_QUIESCE:
6177 /* FALLTHRU */
6178 case RSMSI_STATE_MAP_QUIESCE:
6179 /* wait for the state to change */
6180 while ((sharedp->rsmsi_state ==
6181 RSMSI_STATE_CONNECTING) ||
6182 (sharedp->rsmsi_state ==
6183 RSMSI_STATE_CONN_QUIESCE) ||
6184 (sharedp->rsmsi_state ==
6185 RSMSI_STATE_MAP_QUIESCE)) {
6186 if (cv_wait_sig(&sharedp->rsmsi_cv,
6187 &sharedp->rsmsi_lock) == 0) {
6188 /* signalled - clean up and return */
6189 rsmsharelock_release(seg);
6190 rsmimport_rm(seg);
6191 seg->s_adapter = NULL;
6192 rsmka_release_adapter(adapter);
6193 seg->s_state = RSM_STATE_NEW;
6194 DBG_PRINTF((category, RSM_ERR,
6195 "rsm_connect done: INTERRUPTED\n"));
6196 return (RSMERR_INTERRUPTED);
6200 * the state changed, loop back and check what it is
6202 recheck_state = 1;
6203 break;
6204 case RSMSI_STATE_ABORT_CONNECT:
6205 /* exit the loop and clean up further down */
6206 break;
6207 case RSMSI_STATE_CONNECTED:
6208 /* already connected, good - fall through */
6209 case RSMSI_STATE_MAPPED:
6210 /* already mapped, wow - fall through */
6211 /* access validation etc is done further down */
6212 break;
6213 case RSMSI_STATE_DISCONNECTED:
6214 /* disconnected - so reconnect now */
6215 sharedp->rsmsi_state = RSMSI_STATE_CONNECTING;
6216 break;
6217 default:
6218 ASSERT(0); /* Invalid State */
6220 } while (recheck_state);
6222 if (sharedp->rsmsi_state == RSMSI_STATE_CONNECTING) {
6223 /* we are the first to connect */
6224 rsmsharelock_release(seg);
6226 if (msg->nodeid != my_nodeid) {
6227 addr = get_remote_hwaddr(adapter, msg->nodeid);
6229 if ((int64_t)addr < 0) {
6230 rsmsharelock_acquire(seg);
6231 rsmsharecv_signal(seg, RSMSI_STATE_CONNECTING,
6232 RSMSI_STATE_NEW);
6233 rsmsharelock_release(seg);
6234 rsmimport_rm(seg);
6235 seg->s_adapter = NULL;
6236 rsmka_release_adapter(adapter);
6237 seg->s_state = RSM_STATE_NEW;
6238 DBG_PRINTF((category, RSM_ERR,
6239 "rsm_connect done: hwaddr<0\n"));
6240 return (RSMERR_INTERNAL_ERROR);
6242 } else {
6243 addr = adapter->hwaddr;
6247 * send request to node [src, dest, key, msgid] and get back
6248 * [status, msgid, cookie]
6250 request.rsmipc_key = msg->key;
6252 * we need the s_mode of the exporter so pass
6253 * RSM_ACCESS_TRUSTED
6255 request.rsmipc_perm = RSM_ACCESS_TRUSTED;
6256 request.rsmipc_hdr.rsmipc_type = RSMIPC_MSG_SEGCONNECT;
6257 request.rsmipc_adapter_hwaddr = addr;
6258 request.rsmipc_segment_cookie = sharedp;
6260 e = (int)rsmipc_send(msg->nodeid, &request, &reply);
6261 if (e) {
6262 rsmsharelock_acquire(seg);
6263 rsmsharecv_signal(seg, RSMSI_STATE_CONNECTING,
6264 RSMSI_STATE_NEW);
6265 rsmsharelock_release(seg);
6266 rsmimport_rm(seg);
6267 seg->s_adapter = NULL;
6268 rsmka_release_adapter(adapter);
6269 seg->s_state = RSM_STATE_NEW;
6270 DBG_PRINTF((category, RSM_ERR,
6271 "rsm_connect done:rsmipc_send failed %d\n", e));
6272 return (e);
6275 if (reply.rsmipc_status != RSM_SUCCESS) {
6276 rsmsharelock_acquire(seg);
6277 rsmsharecv_signal(seg, RSMSI_STATE_CONNECTING,
6278 RSMSI_STATE_NEW);
6279 rsmsharelock_release(seg);
6280 rsmimport_rm(seg);
6281 seg->s_adapter = NULL;
6282 rsmka_release_adapter(adapter);
6283 seg->s_state = RSM_STATE_NEW;
6284 DBG_PRINTF((category, RSM_ERR,
6285 "rsm_connect done:rsmipc_send reply err %d\n",
6286 reply.rsmipc_status));
6287 return (reply.rsmipc_status);
6290 rsmsharelock_acquire(seg);
6291 /* store the information recvd into the shared data struct */
6292 sharedp->rsmsi_mode = reply.rsmipc_mode;
6293 sharedp->rsmsi_uid = reply.rsmipc_uid;
6294 sharedp->rsmsi_gid = reply.rsmipc_gid;
6295 sharedp->rsmsi_seglen = reply.rsmipc_seglen;
6296 sharedp->rsmsi_cookie = sharedp;
6299 rsmsharelock_release(seg);
6302 * Get the segment lock and check for a force disconnect
6303 * from the export side which would have changed the state
6304 * back to RSM_STATE_NEW. Once the segment lock is acquired a
6305 * force disconnect will be held off until the connection
6306 * has completed.
6308 rsmseglock_acquire(seg);
6309 rsmsharelock_acquire(seg);
6310 ASSERT(seg->s_state == RSM_STATE_CONNECTING ||
6311 seg->s_state == RSM_STATE_ABORT_CONNECT);
6313 shared_cookie = sharedp->rsmsi_cookie;
6315 if ((seg->s_state == RSM_STATE_ABORT_CONNECT) ||
6316 (sharedp->rsmsi_state == RSMSI_STATE_ABORT_CONNECT)) {
6317 seg->s_state = RSM_STATE_NEW;
6318 seg->s_adapter = NULL;
6319 rsmsharelock_release(seg);
6320 rsmseglock_release(seg);
6321 rsmimport_rm(seg);
6322 rsmka_release_adapter(adapter);
6324 rsmsharelock_acquire(seg);
6325 if (!(sharedp->rsmsi_flags & RSMSI_FLAGS_ABORTDONE)) {
6327 * set a flag indicating abort handling has been
6328 * done
6330 sharedp->rsmsi_flags |= RSMSI_FLAGS_ABORTDONE;
6331 rsmsharelock_release(seg);
6332 /* send a message to exporter - only once */
6333 (void) rsm_send_notimporting(msg->nodeid,
6334 msg->key, shared_cookie);
6335 rsmsharelock_acquire(seg);
6337 * wake up any waiting importers and inform that
6338 * connection has been aborted
6340 cv_broadcast(&sharedp->rsmsi_cv);
6342 rsmsharelock_release(seg);
6344 DBG_PRINTF((category, RSM_ERR,
6345 "rsm_connect done: RSM_STATE_ABORT_CONNECT\n"));
6346 return (RSMERR_INTERRUPTED);
6351 * We need to verify that this process has access
6353 e = rsm_access(sharedp->rsmsi_uid, sharedp->rsmsi_gid,
6354 access & sharedp->rsmsi_mode,
6355 (int)(msg->perm & RSM_PERM_RDWR), cred);
6356 if (e) {
6357 rsmsharelock_release(seg);
6358 seg->s_state = RSM_STATE_NEW;
6359 seg->s_adapter = NULL;
6360 rsmseglock_release(seg);
6361 rsmimport_rm(seg);
6362 rsmka_release_adapter(adapter);
6364 * No need to lock segment it has been removed
6365 * from the hash table
6367 rsmsharelock_acquire(seg);
6368 if (sharedp->rsmsi_state == RSMSI_STATE_CONNECTING) {
6369 rsmsharelock_release(seg);
6370 /* this is the first importer */
6372 (void) rsm_send_notimporting(msg->nodeid, msg->key,
6373 shared_cookie);
6374 rsmsharelock_acquire(seg);
6375 sharedp->rsmsi_state = RSMSI_STATE_NEW;
6376 cv_broadcast(&sharedp->rsmsi_cv);
6378 rsmsharelock_release(seg);
6380 DBG_PRINTF((category, RSM_ERR,
6381 "rsm_connect done: ipcaccess failed\n"));
6382 return (RSMERR_PERM_DENIED);
6385 /* update state and cookie */
6386 seg->s_segid = sharedp->rsmsi_segid;
6387 seg->s_len = sharedp->rsmsi_seglen;
6388 seg->s_mode = access & sharedp->rsmsi_mode;
6389 seg->s_pid = ddi_get_pid();
6390 seg->s_mapinfo = NULL;
6392 if (seg->s_node != my_nodeid) {
6393 if (sharedp->rsmsi_state == RSMSI_STATE_CONNECTING) {
6394 e = adapter->rsmpi_ops->rsm_connect(
6395 adapter->rsmpi_handle,
6396 addr, seg->s_segid, &sharedp->rsmsi_handle);
6398 if (e != RSM_SUCCESS) {
6399 seg->s_state = RSM_STATE_NEW;
6400 seg->s_adapter = NULL;
6401 rsmsharelock_release(seg);
6402 rsmseglock_release(seg);
6403 rsmimport_rm(seg);
6404 rsmka_release_adapter(adapter);
6406 * inform the exporter to delete this importer
6408 (void) rsm_send_notimporting(msg->nodeid,
6409 msg->key, shared_cookie);
6412 * Now inform any waiting importers to
6413 * retry connect. This needs to be done
6414 * after sending notimporting so that
6415 * the notimporting is sent before a waiting
6416 * importer sends a segconnect while retrying
6418 * No need to lock segment it has been removed
6419 * from the hash table
6422 rsmsharelock_acquire(seg);
6423 sharedp->rsmsi_state = RSMSI_STATE_NEW;
6424 cv_broadcast(&sharedp->rsmsi_cv);
6425 rsmsharelock_release(seg);
6427 DBG_PRINTF((category, RSM_ERR,
6428 "rsm_connect error %d\n", e));
6429 if (e == RSMERR_SEG_NOT_PUBLISHED_TO_RSM_ADDR)
6430 return (
6431 RSMERR_SEG_NOT_PUBLISHED_TO_NODE);
6432 else if ((e == RSMERR_RSM_ADDR_UNREACHABLE) ||
6433 (e == RSMERR_UNKNOWN_RSM_ADDR))
6434 return (RSMERR_REMOTE_NODE_UNREACHABLE);
6435 else
6436 return (e);
6440 seg->s_handle.in = sharedp->rsmsi_handle;
6444 seg->s_state = RSM_STATE_CONNECT;
6447 seg->s_flags &= ~RSM_IMPORT_DUMMY; /* clear dummy flag */
6448 if (bar_va) {
6449 /* increment generation number on barrier page */
6450 atomic_inc_16(bar_va + seg->s_hdr.rsmrc_num);
6451 /* return user off into barrier page where status will be */
6452 msg->off = (int)seg->s_hdr.rsmrc_num;
6453 msg->gnum = bar_va[msg->off]; /* gnum race */
6454 } else {
6455 msg->off = 0;
6456 msg->gnum = 0; /* gnum race */
6459 msg->len = (int)sharedp->rsmsi_seglen;
6460 msg->rnum = seg->s_minor;
6461 rsmsharecv_signal(seg, RSMSI_STATE_CONNECTING, RSMSI_STATE_CONNECTED);
6462 rsmsharelock_release(seg);
6463 rsmseglock_release(seg);
6465 /* Return back to user the segment size & perm in case it's needed */
6467 #ifdef _MULTI_DATAMODEL
6468 if ((mode & DATAMODEL_MASK) == DATAMODEL_ILP32) {
6469 rsm_ioctlmsg32_t msg32;
6471 if (msg->len > UINT_MAX)
6472 msg32.len = RSM_MAXSZ_PAGE_ALIGNED;
6473 else
6474 msg32.len = msg->len;
6475 msg32.off = msg->off;
6476 msg32.perm = msg->perm;
6477 msg32.gnum = msg->gnum;
6478 msg32.rnum = msg->rnum;
6480 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
6481 "rsm_connect done\n"));
6483 if (ddi_copyout((caddr_t)&msg32, (caddr_t)dataptr,
6484 sizeof (msg32), mode))
6485 return (RSMERR_BAD_ADDR);
6486 else
6487 return (RSM_SUCCESS);
6489 #endif
6490 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_connect done\n"));
6492 if (ddi_copyout((caddr_t)msg, (caddr_t)dataptr, sizeof (*msg),
6493 mode))
6494 return (RSMERR_BAD_ADDR);
6495 else
6496 return (RSM_SUCCESS);
6499 static int
6500 rsm_unmap(rsmseg_t *seg)
6502 int err;
6503 adapter_t *adapter;
6504 rsm_import_share_t *sharedp;
6505 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_IMPORT);
6507 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
6508 "rsm_unmap enter %u\n", seg->s_segid));
6510 ASSERT(seg->s_hdr.rsmrc_type == RSM_RESOURCE_IMPORT_SEGMENT);
6512 /* assert seg is locked */
6513 ASSERT(rsmseglock_held(seg));
6514 ASSERT(seg->s_state != RSM_STATE_MAPPING);
6516 if ((seg->s_state != RSM_STATE_ACTIVE) &&
6517 (seg->s_state != RSM_STATE_MAP_QUIESCE)) {
6518 /* segment unmap has already been done */
6519 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_unmap done\n"));
6520 return (RSM_SUCCESS);
6523 sharedp = seg->s_share;
6525 rsmsharelock_acquire(seg);
6528 * - shared data struct is in MAPPED or MAP_QUIESCE state
6531 ASSERT(sharedp->rsmsi_state == RSMSI_STATE_MAPPED ||
6532 sharedp->rsmsi_state == RSMSI_STATE_MAP_QUIESCE);
6535 * Unmap pages - previously rsm_memseg_import_unmap was called only if
6536 * the segment cookie list was NULL; but it is always NULL when
6537 * called from rsmmap_unmap and won't be NULL when called for
6538 * a force disconnect - so the check for NULL cookie list was removed
6541 ASSERT(sharedp->rsmsi_mapcnt > 0);
6543 sharedp->rsmsi_mapcnt--;
6545 if (sharedp->rsmsi_mapcnt == 0) {
6546 if (sharedp->rsmsi_state == RSMSI_STATE_MAPPED) {
6547 /* unmap the shared RSMPI mapping */
6548 adapter = seg->s_adapter;
6549 if (seg->s_node != my_nodeid) {
6550 ASSERT(sharedp->rsmsi_handle != NULL);
6551 err = adapter->rsmpi_ops->
6552 rsm_unmap(sharedp->rsmsi_handle);
6553 DBG_PRINTF((category, RSM_DEBUG,
6554 "rsm_unmap: rsmpi unmap %d\n", err));
6555 rsm_free_mapinfo(sharedp->rsmsi_mapinfo);
6556 sharedp->rsmsi_mapinfo = NULL;
6558 sharedp->rsmsi_state = RSMSI_STATE_CONNECTED;
6559 } else { /* MAP_QUIESCE --munmap()--> CONN_QUIESCE */
6560 sharedp->rsmsi_state = RSMSI_STATE_CONN_QUIESCE;
6564 rsmsharelock_release(seg);
6567 * The s_cookie field is used to store the cookie returned from the
6568 * ddi_umem_lock when binding the pages for an export segment. This
6569 * is the primary use of the s_cookie field and does not normally
6570 * pertain to any importing segment except in the loopback case.
6571 * For the loopback case, the import segment and export segment are
6572 * on the same node, the s_cookie field of the segment structure for
6573 * the importer is initialized to the s_cookie field in the exported
6574 * segment during the map operation and is used during the call to
6575 * devmap_umem_setup for the import mapping.
6576 * Thus, during unmap, we simply need to set s_cookie to NULL to
6577 * indicate that the mapping no longer exists.
6579 seg->s_cookie = NULL;
6581 seg->s_mapinfo = NULL;
6583 if (seg->s_state == RSM_STATE_ACTIVE)
6584 seg->s_state = RSM_STATE_CONNECT;
6585 else
6586 seg->s_state = RSM_STATE_CONN_QUIESCE;
6588 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_unmap done\n"));
6590 return (RSM_SUCCESS);
6594 * cookie returned here if not null indicates that it is
6595 * the last importer and it can be used in the RSMIPC_NOT_IMPORTING
6596 * message.
6598 static int
6599 rsm_closeconnection(rsmseg_t *seg, void **cookie)
6601 int e;
6602 adapter_t *adapter;
6603 rsm_import_share_t *sharedp;
6604 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_IMPORT);
6606 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
6607 "rsm_closeconnection enter\n"));
6609 *cookie = NULL;
6611 ASSERT(seg->s_hdr.rsmrc_type == RSM_RESOURCE_IMPORT_SEGMENT);
6613 /* assert seg is locked */
6614 ASSERT(rsmseglock_held(seg));
6616 if (seg->s_state == RSM_STATE_DISCONNECT) {
6617 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
6618 "rsm_closeconnection done: already disconnected\n"));
6619 return (RSM_SUCCESS);
6622 /* wait for all putv/getv ops to get done */
6623 while (seg->s_rdmacnt > 0) {
6624 cv_wait(&seg->s_cv, &seg->s_lock);
6627 (void) rsm_unmap(seg);
6629 ASSERT(seg->s_state == RSM_STATE_CONNECT ||
6630 seg->s_state == RSM_STATE_CONN_QUIESCE);
6632 adapter = seg->s_adapter;
6633 sharedp = seg->s_share;
6635 ASSERT(sharedp != NULL);
6637 rsmsharelock_acquire(seg);
6640 * Disconnect on adapter
6642 * The current algorithm is stateless, I don't have to contact
6643 * server when I go away. It only gives me permissions. Of course,
6644 * the adapters will talk to terminate the connect.
6646 * disconnect is needed only if we are CONNECTED not in CONN_QUIESCE
6648 if ((sharedp->rsmsi_state == RSMSI_STATE_CONNECTED) &&
6649 (sharedp->rsmsi_node != my_nodeid)) {
6651 if (sharedp->rsmsi_refcnt == 1) {
6652 /* this is the last importer */
6653 ASSERT(sharedp->rsmsi_mapcnt == 0);
6655 e = adapter->rsmpi_ops->
6656 rsm_disconnect(sharedp->rsmsi_handle);
6657 if (e != RSM_SUCCESS) {
6658 DBG_PRINTF((category, RSM_DEBUG,
6659 "rsm:disconnect failed seg=%x:err=%d\n",
6660 seg->s_key, e));
6665 seg->s_handle.in = NULL;
6667 sharedp->rsmsi_refcnt--;
6669 if (sharedp->rsmsi_refcnt == 0) {
6670 *cookie = (void *)sharedp->rsmsi_cookie;
6671 sharedp->rsmsi_state = RSMSI_STATE_DISCONNECTED;
6672 sharedp->rsmsi_handle = NULL;
6673 rsmsharelock_release(seg);
6675 /* clean up the shared data structure */
6676 mutex_destroy(&sharedp->rsmsi_lock);
6677 cv_destroy(&sharedp->rsmsi_cv);
6678 kmem_free((void *)(sharedp), sizeof (rsm_import_share_t));
6680 } else {
6681 rsmsharelock_release(seg);
6684 /* increment generation number on barrier page */
6685 if (bar_va) {
6686 atomic_inc_16(bar_va + seg->s_hdr.rsmrc_num);
6690 * The following needs to be done after any
6691 * rsmsharelock calls which use seg->s_share.
6693 seg->s_share = NULL;
6695 seg->s_state = RSM_STATE_DISCONNECT;
6696 /* signal anyone waiting in the CONN_QUIESCE state */
6697 cv_broadcast(&seg->s_cv);
6699 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
6700 "rsm_closeconnection done\n"));
6702 return (RSM_SUCCESS);
6706 rsm_disconnect(rsmseg_t *seg)
6708 rsmipc_request_t request;
6709 void *shared_cookie;
6710 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_IMPORT);
6712 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_disconnect enter\n"));
6714 ASSERT(seg->s_hdr.rsmrc_type == RSM_RESOURCE_IMPORT_SEGMENT);
6716 /* assert seg isn't locked */
6717 ASSERT(!rsmseglock_held(seg));
6720 /* Remove segment from imported list */
6721 rsmimport_rm(seg);
6723 /* acquire the segment */
6724 rsmseglock_acquire(seg);
6726 /* wait until segment leaves the mapping state */
6727 while (seg->s_state == RSM_STATE_MAPPING)
6728 cv_wait(&seg->s_cv, &seg->s_lock);
6730 if (seg->s_state == RSM_STATE_DISCONNECT) {
6731 seg->s_state = RSM_STATE_NEW;
6732 rsmseglock_release(seg);
6733 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
6734 "rsm_disconnect done: already disconnected\n"));
6735 return (RSM_SUCCESS);
6738 (void) rsm_closeconnection(seg, &shared_cookie);
6740 /* update state */
6741 seg->s_state = RSM_STATE_NEW;
6743 if (shared_cookie != NULL) {
6745 * This is the last importer so inform the exporting node
6746 * so this import can be deleted from the list of importers.
6748 request.rsmipc_hdr.rsmipc_type = RSMIPC_MSG_NOTIMPORTING;
6749 request.rsmipc_key = seg->s_segid;
6750 request.rsmipc_segment_cookie = shared_cookie;
6751 rsmseglock_release(seg);
6752 (void) rsmipc_send(seg->s_node, &request, RSM_NO_REPLY);
6753 } else {
6754 rsmseglock_release(seg);
6757 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_disconnect done\n"));
6759 return (DDI_SUCCESS);
6762 static int
6763 rsm_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
6764 struct pollhead **phpp)
6766 minor_t rnum;
6767 rsmresource_t *res;
6768 rsmseg_t *seg;
6769 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_DDI);
6771 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_chpoll enter\n"));
6773 /* find minor, no lock */
6774 rnum = getminor(dev);
6775 res = rsmresource_lookup(rnum, RSM_NOLOCK);
6777 /* poll is supported only for export/import segments */
6778 if ((res == NULL) || (res == RSMRC_RESERVED) ||
6779 (res->rsmrc_type == RSM_RESOURCE_BAR)) {
6780 return (ENXIO);
6784 * An exported segment must be in state RSM_STATE_EXPORT; an
6785 * imported segment must be in state RSM_STATE_ACTIVE.
6787 seg = (rsmseg_t *)res;
6789 if (seg->s_pollevent) {
6790 *reventsp = POLLRDNORM;
6791 } else {
6792 *reventsp = 0;
6795 if ((*reventsp == 0 && !anyyet) || (events & POLLET)) {
6796 /* cannot take segment lock here */
6797 *phpp = &seg->s_poll;
6798 seg->s_pollflag |= RSM_SEGMENT_POLL;
6800 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_chpoll done\n"));
6801 return (0);
6806 /* ************************* IOCTL Commands ********************* */
6808 static rsmseg_t *
6809 rsmresource_seg(rsmresource_t *res, minor_t rnum, cred_t *credp,
6810 rsm_resource_type_t type)
6812 /* get segment from resource handle */
6813 rsmseg_t *seg;
6814 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_IOCTL);
6816 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmresource_seg enter\n"));
6819 if (res != RSMRC_RESERVED) {
6820 seg = (rsmseg_t *)res;
6821 } else {
6822 /* Allocate segment now and bind it */
6823 seg = rsmseg_alloc(rnum, credp);
6826 * if DR pre-processing is going on or DR is in progress
6827 * then the new export segments should be in the NEW_QSCD state
6829 if (type == RSM_RESOURCE_EXPORT_SEGMENT) {
6830 mutex_enter(&rsm_drv_data.drv_lock);
6831 if ((rsm_drv_data.drv_state ==
6832 RSM_DRV_PREDEL_STARTED) ||
6833 (rsm_drv_data.drv_state ==
6834 RSM_DRV_PREDEL_COMPLETED) ||
6835 (rsm_drv_data.drv_state ==
6836 RSM_DRV_DR_IN_PROGRESS)) {
6837 seg->s_state = RSM_STATE_NEW_QUIESCED;
6839 mutex_exit(&rsm_drv_data.drv_lock);
6842 rsmresource_insert(rnum, (rsmresource_t *)seg, type);
6845 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmresource_seg done\n"));
6847 return (seg);
6850 static int
6851 rsmexport_ioctl(rsmseg_t *seg, rsm_ioctlmsg_t *msg, int cmd, intptr_t arg,
6852 int mode, cred_t *credp)
6854 int error;
6855 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_EXPORT | RSM_IOCTL);
6857 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmexport_ioctl enter\n"));
6859 arg = arg;
6860 credp = credp;
6862 ASSERT(seg != NULL);
6864 switch (cmd) {
6865 case RSM_IOCTL_BIND:
6866 error = rsm_bind(seg, msg, arg, mode);
6867 break;
6868 case RSM_IOCTL_REBIND:
6869 error = rsm_rebind(seg, msg);
6870 break;
6871 case RSM_IOCTL_UNBIND:
6872 error = ENOTSUP;
6873 break;
6874 case RSM_IOCTL_PUBLISH:
6875 error = rsm_publish(seg, msg, arg, mode);
6876 break;
6877 case RSM_IOCTL_REPUBLISH:
6878 error = rsm_republish(seg, msg, mode);
6879 break;
6880 case RSM_IOCTL_UNPUBLISH:
6881 error = rsm_unpublish(seg, 1);
6882 break;
6883 default:
6884 error = EINVAL;
6885 break;
6888 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmexport_ioctl done: %d\n",
6889 error));
6891 return (error);
6893 static int
6894 rsmimport_ioctl(rsmseg_t *seg, rsm_ioctlmsg_t *msg, int cmd, intptr_t arg,
6895 int mode, cred_t *credp)
6897 int error;
6898 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_IMPORT | RSM_IOCTL);
6900 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmimport_ioctl enter\n"));
6902 ASSERT(seg);
6904 switch (cmd) {
6905 case RSM_IOCTL_CONNECT:
6906 error = rsm_connect(seg, msg, credp, arg, mode);
6907 break;
6908 default:
6909 error = EINVAL;
6910 break;
6913 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmimport_ioctl done: %d\n",
6914 error));
6915 return (error);
6918 static int
6919 rsmbar_ioctl(rsmseg_t *seg, rsm_ioctlmsg_t *msg, int cmd, intptr_t arg,
6920 int mode)
6922 int e;
6923 adapter_t *adapter;
6924 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_IMPORT | RSM_IOCTL);
6926 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmbar_ioctl enter\n"));
6929 if ((seg->s_flags & RSM_IMPORT_DUMMY) != 0) {
6930 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
6931 "rsmbar_ioctl done: RSM_IMPORT_DUMMY\n"));
6932 return (RSMERR_CONN_ABORTED);
6933 } else if (seg->s_node == my_nodeid) {
6934 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
6935 "rsmbar_ioctl done: loopback\n"));
6936 return (RSM_SUCCESS);
6939 adapter = seg->s_adapter;
6941 switch (cmd) {
6942 case RSM_IOCTL_BAR_CHECK:
6943 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
6944 "rsmbar_ioctl done: RSM_BAR_CHECK %d\n", bar_va));
6945 return (bar_va ? RSM_SUCCESS : EINVAL);
6946 case RSM_IOCTL_BAR_OPEN:
6947 e = adapter->rsmpi_ops->
6948 rsm_open_barrier_ctrl(adapter->rsmpi_handle, &msg->bar);
6949 break;
6950 case RSM_IOCTL_BAR_ORDER:
6951 e = adapter->rsmpi_ops->rsm_order_barrier(&msg->bar);
6952 break;
6953 case RSM_IOCTL_BAR_CLOSE:
6954 e = adapter->rsmpi_ops->rsm_close_barrier(&msg->bar);
6955 break;
6956 default:
6957 e = EINVAL;
6958 break;
6961 if (e == RSM_SUCCESS) {
6962 #ifdef _MULTI_DATAMODEL
6963 if ((mode & DATAMODEL_MASK) == DATAMODEL_ILP32) {
6964 rsm_ioctlmsg32_t msg32;
6965 int i;
6967 for (i = 0; i < 4; i++) {
6968 msg32.bar.comp[i].u64 = msg->bar.comp[i].u64;
6971 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
6972 "rsmbar_ioctl done\n"));
6973 if (ddi_copyout((caddr_t)&msg32, (caddr_t)arg,
6974 sizeof (msg32), mode))
6975 return (RSMERR_BAD_ADDR);
6976 else
6977 return (RSM_SUCCESS);
6979 #endif
6980 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
6981 "rsmbar_ioctl done\n"));
6982 if (ddi_copyout((caddr_t)&msg->bar, (caddr_t)arg,
6983 sizeof (*msg), mode))
6984 return (RSMERR_BAD_ADDR);
6985 else
6986 return (RSM_SUCCESS);
6989 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
6990 "rsmbar_ioctl done: error=%d\n", e));
6992 return (e);
6996 * Ring the doorbell of the export segment to which this segment is
6997 * connected.
6999 static int
7000 exportbell_ioctl(rsmseg_t *seg, int cmd /*ARGSUSED*/)
7002 int e = 0;
7003 rsmipc_request_t request;
7005 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_IMPORT | RSM_IOCTL);
7007 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "exportbell_ioctl enter\n"));
7009 request.rsmipc_key = seg->s_segid;
7010 request.rsmipc_hdr.rsmipc_type = RSMIPC_MSG_BELL;
7011 request.rsmipc_segment_cookie = NULL;
7012 e = rsmipc_send(seg->s_node, &request, RSM_NO_REPLY);
7014 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
7015 "exportbell_ioctl done: %d\n", e));
7017 return (e);
7021 * Ring the doorbells of all segments importing this segment
7023 static int
7024 importbell_ioctl(rsmseg_t *seg, int cmd /*ARGSUSED*/)
7026 importing_token_t *token = NULL;
7027 rsmipc_request_t request;
7028 int index;
7030 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_EXPORT | RSM_IOCTL);
7032 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "importbell_ioctl enter\n"));
7034 ASSERT(seg->s_state != RSM_STATE_NEW &&
7035 seg->s_state != RSM_STATE_NEW_QUIESCED);
7037 request.rsmipc_key = seg->s_segid;
7038 request.rsmipc_hdr.rsmipc_type = RSMIPC_MSG_BELL;
7040 index = rsmhash(seg->s_segid);
7042 token = importer_list.bucket[index];
7044 while (token != NULL) {
7045 if (seg->s_key == token->key) {
7046 request.rsmipc_segment_cookie =
7047 token->import_segment_cookie;
7048 (void) rsmipc_send(token->importing_node,
7049 &request, RSM_NO_REPLY);
7051 token = token->next;
7054 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
7055 "importbell_ioctl done\n"));
7056 return (RSM_SUCCESS);
7059 static int
7060 rsm_consumeevent_copyin(caddr_t arg, rsm_consume_event_msg_t *msgp,
7061 rsm_poll_event_t **eventspp, int mode)
7063 rsm_poll_event_t *evlist = NULL;
7064 size_t evlistsz;
7065 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_IOCTL);
7067 #ifdef _MULTI_DATAMODEL
7068 if ((mode & DATAMODEL_MASK) == DATAMODEL_ILP32) {
7069 int i;
7070 rsm_consume_event_msg32_t cemsg32 = {0};
7071 rsm_poll_event32_t event32[RSM_MAX_POLLFDS];
7072 rsm_poll_event32_t *evlist32;
7073 size_t evlistsz32;
7075 /* copyin the ioctl message */
7076 if (ddi_copyin(arg, (caddr_t)&cemsg32,
7077 sizeof (rsm_consume_event_msg32_t), mode)) {
7078 DBG_PRINTF((category, RSM_ERR,
7079 "consumeevent_copyin msgp: RSMERR_BAD_ADDR\n"));
7080 return (RSMERR_BAD_ADDR);
7082 msgp->seglist = (caddr_t)(uintptr_t)cemsg32.seglist;
7083 msgp->numents = (int)cemsg32.numents;
7085 evlistsz32 = sizeof (rsm_poll_event32_t) * msgp->numents;
7087 * If numents is large alloc events list on heap otherwise
7088 * use the address of array that was passed in.
7090 if (msgp->numents > RSM_MAX_POLLFDS) {
7091 if (msgp->numents > max_segs) { /* validate numents */
7092 DBG_PRINTF((category, RSM_ERR,
7093 "consumeevent_copyin: "
7094 "RSMERR_BAD_ARGS_ERRORS\n"));
7095 return (RSMERR_BAD_ARGS_ERRORS);
7097 evlist32 = kmem_zalloc(evlistsz32, KM_SLEEP);
7098 } else {
7099 evlist32 = event32;
7102 /* copyin the seglist into the rsm_poll_event32_t array */
7103 if (ddi_copyin((caddr_t)msgp->seglist, (caddr_t)evlist32,
7104 evlistsz32, mode)) {
7105 if ((msgp->numents > RSM_MAX_POLLFDS) && evlist32) {
7106 kmem_free(evlist32, evlistsz32);
7108 DBG_PRINTF((category, RSM_ERR,
7109 "consumeevent_copyin evlist: RSMERR_BAD_ADDR\n"));
7110 return (RSMERR_BAD_ADDR);
7113 /* evlist and evlistsz are based on rsm_poll_event_t type */
7114 evlistsz = sizeof (rsm_poll_event_t)* msgp->numents;
7116 if (msgp->numents > RSM_MAX_POLLFDS) {
7117 evlist = kmem_zalloc(evlistsz, KM_SLEEP);
7118 *eventspp = evlist;
7119 } else {
7120 evlist = *eventspp;
7123 * copy the rsm_poll_event32_t array to the rsm_poll_event_t
7124 * array
7126 for (i = 0; i < msgp->numents; i++) {
7127 evlist[i].rnum = evlist32[i].rnum;
7128 evlist[i].fdsidx = evlist32[i].fdsidx;
7129 evlist[i].revent = evlist32[i].revent;
7131 /* free the temp 32-bit event list */
7132 if ((msgp->numents > RSM_MAX_POLLFDS) && evlist32) {
7133 kmem_free(evlist32, evlistsz32);
7136 return (RSM_SUCCESS);
7138 #endif
7139 /* copyin the ioctl message */
7140 if (ddi_copyin(arg, (caddr_t)msgp, sizeof (rsm_consume_event_msg_t),
7141 mode)) {
7142 DBG_PRINTF((category, RSM_ERR,
7143 "consumeevent_copyin msgp: RSMERR_BAD_ADDR\n"));
7144 return (RSMERR_BAD_ADDR);
7147 * If numents is large alloc events list on heap otherwise
7148 * use the address of array that was passed in.
7150 if (msgp->numents > RSM_MAX_POLLFDS) {
7151 if (msgp->numents > max_segs) { /* validate numents */
7152 DBG_PRINTF((category, RSM_ERR,
7153 "consumeevent_copyin: RSMERR_BAD_ARGS_ERRORS\n"));
7154 return (RSMERR_BAD_ARGS_ERRORS);
7156 evlistsz = sizeof (rsm_poll_event_t)*msgp->numents;
7157 evlist = kmem_zalloc(evlistsz, KM_SLEEP);
7158 *eventspp = evlist;
7161 /* copyin the seglist */
7162 if (ddi_copyin((caddr_t)msgp->seglist, (caddr_t)(*eventspp),
7163 sizeof (rsm_poll_event_t)*msgp->numents, mode)) {
7164 if (evlist) {
7165 kmem_free(evlist, evlistsz);
7166 *eventspp = NULL;
7168 DBG_PRINTF((category, RSM_ERR,
7169 "consumeevent_copyin evlist: RSMERR_BAD_ADDR\n"));
7170 return (RSMERR_BAD_ADDR);
7173 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
7174 "consumeevent_copyin done\n"));
7175 return (RSM_SUCCESS);
7178 static int
7179 rsm_consumeevent_copyout(rsm_consume_event_msg_t *msgp,
7180 rsm_poll_event_t *eventsp, int mode)
7182 size_t evlistsz;
7183 int err = RSM_SUCCESS;
7184 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_IOCTL);
7186 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
7187 "consumeevent_copyout enter: numents(%d) eventsp(%p)\n",
7188 msgp->numents, eventsp));
7190 #ifdef _MULTI_DATAMODEL
7191 if ((mode & DATAMODEL_MASK) == DATAMODEL_ILP32) {
7192 int i;
7193 rsm_poll_event32_t event32[RSM_MAX_POLLFDS];
7194 rsm_poll_event32_t *evlist32;
7195 size_t evlistsz32;
7197 evlistsz32 = sizeof (rsm_poll_event32_t)*msgp->numents;
7198 if (msgp->numents > RSM_MAX_POLLFDS) {
7199 evlist32 = kmem_zalloc(evlistsz32, KM_SLEEP);
7200 } else {
7201 evlist32 = event32;
7205 * copy the rsm_poll_event_t array to the rsm_poll_event32_t
7206 * array
7208 for (i = 0; i < msgp->numents; i++) {
7209 evlist32[i].rnum = eventsp[i].rnum;
7210 evlist32[i].fdsidx = eventsp[i].fdsidx;
7211 evlist32[i].revent = eventsp[i].revent;
7214 if (ddi_copyout((caddr_t)evlist32, (caddr_t)msgp->seglist,
7215 evlistsz32, mode)) {
7216 err = RSMERR_BAD_ADDR;
7219 if (msgp->numents > RSM_MAX_POLLFDS) {
7220 if (evlist32) { /* free the temp 32-bit event list */
7221 kmem_free(evlist32, evlistsz32);
7224 * eventsp and evlistsz are based on rsm_poll_event_t
7225 * type
7227 evlistsz = sizeof (rsm_poll_event_t)*msgp->numents;
7228 /* event list on the heap and needs to be freed here */
7229 if (eventsp) {
7230 kmem_free(eventsp, evlistsz);
7234 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
7235 "consumeevent_copyout done: err=%d\n", err));
7236 return (err);
7238 #endif
7239 evlistsz = sizeof (rsm_poll_event_t)*msgp->numents;
7241 if (ddi_copyout((caddr_t)eventsp, (caddr_t)msgp->seglist, evlistsz,
7242 mode)) {
7243 err = RSMERR_BAD_ADDR;
7246 if ((msgp->numents > RSM_MAX_POLLFDS) && eventsp) {
7247 /* event list on the heap and needs to be freed here */
7248 kmem_free(eventsp, evlistsz);
7251 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
7252 "consumeevent_copyout done: err=%d\n", err));
7253 return (err);
7256 static int
7257 rsm_consumeevent_ioctl(caddr_t arg, int mode)
7259 int rc;
7260 int i;
7261 minor_t rnum;
7262 rsm_consume_event_msg_t msg = {0};
7263 rsmseg_t *seg;
7264 rsm_poll_event_t *event_list;
7265 rsm_poll_event_t events[RSM_MAX_POLLFDS];
7266 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_IOCTL);
7268 event_list = events;
7270 if ((rc = rsm_consumeevent_copyin(arg, &msg, &event_list, mode)) !=
7271 RSM_SUCCESS) {
7272 return (rc);
7275 for (i = 0; i < msg.numents; i++) {
7276 rnum = event_list[i].rnum;
7277 event_list[i].revent = 0;
7278 /* get the segment structure */
7279 seg = (rsmseg_t *)rsmresource_lookup(rnum, RSM_LOCK);
7280 if (seg) {
7281 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
7282 "consumeevent_ioctl: rnum(%d) seg(%p)\n", rnum,
7283 seg));
7284 if (seg->s_pollevent) {
7285 /* consume the event */
7286 atomic_dec_32(&seg->s_pollevent);
7287 event_list[i].revent = POLLRDNORM;
7289 rsmseglock_release(seg);
7293 if ((rc = rsm_consumeevent_copyout(&msg, event_list, mode)) !=
7294 RSM_SUCCESS) {
7295 return (rc);
7298 return (RSM_SUCCESS);
7301 static int
7302 iovec_copyin(caddr_t user_vec, rsmka_iovec_t *iovec, int count, int mode)
7304 int size;
7305 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_IMPORT | RSM_IOCTL);
7307 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "iovec_copyin enter\n"));
7309 #ifdef _MULTI_DATAMODEL
7310 if ((mode & DATAMODEL_MASK) == DATAMODEL_ILP32) {
7311 rsmka_iovec32_t *iovec32, *iovec32_base;
7312 int i;
7314 size = count * sizeof (rsmka_iovec32_t);
7315 iovec32_base = iovec32 = kmem_zalloc(size, KM_SLEEP);
7316 if (ddi_copyin((caddr_t)user_vec,
7317 (caddr_t)iovec32, size, mode)) {
7318 kmem_free(iovec32, size);
7319 DBG_PRINTF((category, RSM_DEBUG,
7320 "iovec_copyin: returning RSMERR_BAD_ADDR\n"));
7321 return (RSMERR_BAD_ADDR);
7324 for (i = 0; i < count; i++, iovec++, iovec32++) {
7325 iovec->io_type = (int)iovec32->io_type;
7326 if (iovec->io_type == RSM_HANDLE_TYPE)
7327 iovec->local.segid = (rsm_memseg_id_t)
7328 iovec32->local;
7329 else
7330 iovec->local.vaddr =
7331 (caddr_t)(uintptr_t)iovec32->local;
7332 iovec->local_offset = (size_t)iovec32->local_offset;
7333 iovec->remote_offset = (size_t)iovec32->remote_offset;
7334 iovec->transfer_len = (size_t)iovec32->transfer_len;
7337 kmem_free(iovec32_base, size);
7338 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
7339 "iovec_copyin done\n"));
7340 return (DDI_SUCCESS);
7342 #endif
7344 size = count * sizeof (rsmka_iovec_t);
7345 if (ddi_copyin((caddr_t)user_vec, (caddr_t)iovec, size, mode)) {
7346 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
7347 "iovec_copyin done: RSMERR_BAD_ADDR\n"));
7348 return (RSMERR_BAD_ADDR);
7351 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "iovec_copyin done\n"));
7353 return (DDI_SUCCESS);
7357 static int
7358 sgio_copyin(caddr_t arg, rsmka_scat_gath_t *sg_io, int mode)
7360 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_IMPORT | RSM_IOCTL);
7362 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "sgio_copyin enter\n"));
7364 #ifdef _MULTI_DATAMODEL
7365 if ((mode & DATAMODEL_MASK) == DATAMODEL_ILP32) {
7366 rsmka_scat_gath32_t sg_io32;
7368 if (ddi_copyin(arg, (caddr_t)&sg_io32, sizeof (sg_io32),
7369 mode)) {
7370 DBG_PRINTF((category, RSM_DEBUG,
7371 "sgio_copyin done: returning EFAULT\n"));
7372 return (RSMERR_BAD_ADDR);
7374 sg_io->local_nodeid = (rsm_node_id_t)sg_io32.local_nodeid;
7375 sg_io->io_request_count = (size_t)sg_io32.io_request_count;
7376 sg_io->io_residual_count = (size_t)sg_io32.io_residual_count;
7377 sg_io->flags = (size_t)sg_io32.flags;
7378 sg_io->remote_handle = (rsm_memseg_import_handle_t)
7379 (uintptr_t)sg_io32.remote_handle;
7380 sg_io->iovec = (rsmka_iovec_t *)(uintptr_t)sg_io32.iovec;
7381 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
7382 "sgio_copyin done\n"));
7383 return (DDI_SUCCESS);
7385 #endif
7386 if (ddi_copyin(arg, (caddr_t)sg_io, sizeof (rsmka_scat_gath_t),
7387 mode)) {
7388 DBG_PRINTF((category, RSM_DEBUG,
7389 "sgio_copyin done: returning EFAULT\n"));
7390 return (RSMERR_BAD_ADDR);
7392 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "sgio_copyin done\n"));
7393 return (DDI_SUCCESS);
7396 static int
7397 sgio_resid_copyout(caddr_t arg, rsmka_scat_gath_t *sg_io, int mode)
7399 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_IMPORT | RSM_IOCTL);
7401 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
7402 "sgio_resid_copyout enter\n"));
7404 #ifdef _MULTI_DATAMODEL
7405 if ((mode & DATAMODEL_MASK) == DATAMODEL_ILP32) {
7406 rsmka_scat_gath32_t sg_io32;
7408 sg_io32.io_residual_count = sg_io->io_residual_count;
7409 sg_io32.flags = sg_io->flags;
7411 if (ddi_copyout((caddr_t)&sg_io32.io_residual_count,
7412 (caddr_t)&((rsmka_scat_gath32_t *)arg)->io_residual_count,
7413 sizeof (uint32_t), mode)) {
7415 DBG_PRINTF((category, RSM_ERR,
7416 "sgio_resid_copyout error: rescnt\n"));
7417 return (RSMERR_BAD_ADDR);
7420 if (ddi_copyout((caddr_t)&sg_io32.flags,
7421 (caddr_t)&((rsmka_scat_gath32_t *)arg)->flags,
7422 sizeof (uint32_t), mode)) {
7424 DBG_PRINTF((category, RSM_ERR,
7425 "sgio_resid_copyout error: flags\n"));
7426 return (RSMERR_BAD_ADDR);
7428 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
7429 "sgio_resid_copyout done\n"));
7430 return (DDI_SUCCESS);
7432 #endif
7433 if (ddi_copyout((caddr_t)&sg_io->io_residual_count,
7434 (caddr_t)&((rsmka_scat_gath_t *)arg)->io_residual_count,
7435 sizeof (ulong_t), mode)) {
7437 DBG_PRINTF((category, RSM_ERR,
7438 "sgio_resid_copyout error:rescnt\n"));
7439 return (RSMERR_BAD_ADDR);
7442 if (ddi_copyout((caddr_t)&sg_io->flags,
7443 (caddr_t)&((rsmka_scat_gath_t *)arg)->flags,
7444 sizeof (uint_t), mode)) {
7446 DBG_PRINTF((category, RSM_ERR,
7447 "sgio_resid_copyout error:flags\n"));
7448 return (RSMERR_BAD_ADDR);
7451 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "sgio_resid_copyout done\n"));
7452 return (DDI_SUCCESS);
7456 static int
7457 rsm_iovec_ioctl(dev_t dev, caddr_t arg, int cmd, int mode, cred_t *credp)
7459 rsmka_scat_gath_t sg_io;
7460 rsmka_iovec_t ka_iovec_arr[RSM_MAX_IOVLEN];
7461 rsmka_iovec_t *ka_iovec;
7462 rsmka_iovec_t *ka_iovec_start;
7463 rsmpi_scat_gath_t rsmpi_sg_io;
7464 rsmpi_iovec_t iovec_arr[RSM_MAX_IOVLEN];
7465 rsmpi_iovec_t *iovec;
7466 rsmpi_iovec_t *iovec_start = NULL;
7467 rsmapi_access_entry_t *acl;
7468 rsmresource_t *res;
7469 minor_t rnum;
7470 rsmseg_t *im_seg, *ex_seg;
7471 int e;
7472 int error = 0;
7473 uint_t i;
7474 uint_t iov_proc = 0; /* num of iovecs processed */
7475 size_t size = 0;
7476 size_t ka_size;
7478 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_IMPORT | RSM_IOCTL);
7480 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_iovec_ioctl enter\n"));
7482 credp = credp;
7485 * Copyin the scatter/gather structure and build new structure
7486 * for rsmpi.
7488 e = sgio_copyin(arg, &sg_io, mode);
7489 if (e != DDI_SUCCESS) {
7490 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
7491 "rsm_iovec_ioctl done: sgio_copyin %d\n", e));
7492 return (e);
7495 if (sg_io.io_request_count > RSM_MAX_SGIOREQS) {
7496 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
7497 "rsm_iovec_ioctl done: request_count(%d) too large\n",
7498 sg_io.io_request_count));
7499 return (RSMERR_BAD_SGIO);
7502 rsmpi_sg_io.io_request_count = sg_io.io_request_count;
7503 rsmpi_sg_io.io_residual_count = sg_io.io_request_count;
7504 rsmpi_sg_io.io_segflg = 0;
7506 /* Allocate memory and copyin io vector array */
7507 if (sg_io.io_request_count > RSM_MAX_IOVLEN) {
7508 ka_size = sg_io.io_request_count * sizeof (rsmka_iovec_t);
7509 ka_iovec_start = ka_iovec = kmem_zalloc(ka_size, KM_SLEEP);
7510 } else {
7511 ka_iovec_start = ka_iovec = ka_iovec_arr;
7513 e = iovec_copyin((caddr_t)sg_io.iovec, ka_iovec,
7514 sg_io.io_request_count, mode);
7515 if (e != DDI_SUCCESS) {
7516 if (sg_io.io_request_count > RSM_MAX_IOVLEN)
7517 kmem_free(ka_iovec, ka_size);
7518 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
7519 "rsm_iovec_ioctl done: iovec_copyin %d\n", e));
7520 return (e);
7523 /* get the import segment descriptor */
7524 rnum = getminor(dev);
7525 res = rsmresource_lookup(rnum, RSM_LOCK);
7528 * The following sequence of locking may (or MAY NOT) cause a
7529 * deadlock but this is currently not addressed here since the
7530 * implementation will be changed to incorporate the use of
7531 * reference counting for both the import and the export segments.
7534 /* rsmseglock_acquire(im_seg) done in rsmresource_lookup */
7536 im_seg = (rsmseg_t *)res;
7538 if (im_seg == NULL) {
7539 if (sg_io.io_request_count > RSM_MAX_IOVLEN)
7540 kmem_free(ka_iovec, ka_size);
7541 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
7542 "rsm_iovec_ioctl done: rsmresource_lookup failed\n"));
7543 return (EINVAL);
7545 /* putv/getv supported is supported only on import segments */
7546 if (im_seg->s_type != RSM_RESOURCE_IMPORT_SEGMENT) {
7547 rsmseglock_release(im_seg);
7548 if (sg_io.io_request_count > RSM_MAX_IOVLEN)
7549 kmem_free(ka_iovec, ka_size);
7550 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
7551 "rsm_iovec_ioctl done: not an import segment\n"));
7552 return (EINVAL);
7556 * wait for a remote DR to complete ie. for segments to get UNQUIESCED
7557 * as well as wait for a local DR to complete.
7559 while ((im_seg->s_state == RSM_STATE_CONN_QUIESCE) ||
7560 (im_seg->s_state == RSM_STATE_MAP_QUIESCE) ||
7561 (im_seg->s_flags & RSM_DR_INPROGRESS)) {
7562 if (cv_wait_sig(&im_seg->s_cv, &im_seg->s_lock) == 0) {
7563 DBG_PRINTF((category, RSM_DEBUG,
7564 "rsm_iovec_ioctl done: cv_wait INTR"));
7565 rsmseglock_release(im_seg);
7566 return (RSMERR_INTERRUPTED);
7570 if ((im_seg->s_state != RSM_STATE_CONNECT) &&
7571 (im_seg->s_state != RSM_STATE_ACTIVE)) {
7573 ASSERT(im_seg->s_state == RSM_STATE_DISCONNECT ||
7574 im_seg->s_state == RSM_STATE_NEW);
7576 DBG_PRINTF((category, RSM_DEBUG,
7577 "rsm_iovec_ioctl done: im_seg not conn/map"));
7578 rsmseglock_release(im_seg);
7579 e = RSMERR_BAD_SGIO;
7580 goto out;
7583 im_seg->s_rdmacnt++;
7584 rsmseglock_release(im_seg);
7587 * Allocate and set up the io vector for rsmpi
7589 if (sg_io.io_request_count > RSM_MAX_IOVLEN) {
7590 size = sg_io.io_request_count * sizeof (rsmpi_iovec_t);
7591 iovec_start = iovec = kmem_zalloc(size, KM_SLEEP);
7592 } else {
7593 iovec_start = iovec = iovec_arr;
7596 rsmpi_sg_io.iovec = iovec;
7597 for (iov_proc = 0; iov_proc < sg_io.io_request_count; iov_proc++) {
7598 if (ka_iovec->io_type == RSM_HANDLE_TYPE) {
7599 ex_seg = rsmexport_lookup(ka_iovec->local.segid);
7601 if (ex_seg == NULL) {
7602 e = RSMERR_BAD_SGIO;
7603 break;
7605 ASSERT(ex_seg->s_state == RSM_STATE_EXPORT);
7607 acl = ex_seg->s_acl;
7608 if (acl[0].ae_permission == 0) {
7609 struct buf *xbuf;
7610 dev_t sdev = 0;
7612 xbuf = ddi_umem_iosetup(ex_seg->s_cookie,
7613 0, ex_seg->s_len, B_WRITE,
7614 sdev, 0, NULL, DDI_UMEM_SLEEP);
7616 ASSERT(xbuf != NULL);
7618 iovec->local_mem.ms_type = RSM_MEM_BUF;
7619 iovec->local_mem.ms_memory.bp = xbuf;
7620 } else {
7621 iovec->local_mem.ms_type = RSM_MEM_HANDLE;
7622 iovec->local_mem.ms_memory.handle =
7623 ex_seg->s_handle.out;
7625 ex_seg->s_rdmacnt++; /* refcnt the handle */
7626 rsmseglock_release(ex_seg);
7627 } else {
7628 iovec->local_mem.ms_type = RSM_MEM_VADDR;
7629 iovec->local_mem.ms_memory.vr.vaddr =
7630 ka_iovec->local.vaddr;
7633 iovec->local_offset = ka_iovec->local_offset;
7634 iovec->remote_handle = im_seg->s_handle.in;
7635 iovec->remote_offset = ka_iovec->remote_offset;
7636 iovec->transfer_length = ka_iovec->transfer_len;
7637 iovec++;
7638 ka_iovec++;
7641 if (iov_proc < sg_io.io_request_count) {
7642 /* error while processing handle */
7643 rsmseglock_acquire(im_seg);
7644 im_seg->s_rdmacnt--; /* decrement the refcnt for importseg */
7645 if (im_seg->s_rdmacnt == 0) {
7646 cv_broadcast(&im_seg->s_cv);
7648 rsmseglock_release(im_seg);
7649 goto out;
7652 /* call rsmpi */
7653 if (cmd == RSM_IOCTL_PUTV)
7654 e = im_seg->s_adapter->rsmpi_ops->rsm_memseg_import_putv(
7655 im_seg->s_adapter->rsmpi_handle,
7656 &rsmpi_sg_io);
7657 else if (cmd == RSM_IOCTL_GETV)
7658 e = im_seg->s_adapter->rsmpi_ops->rsm_memseg_import_getv(
7659 im_seg->s_adapter->rsmpi_handle,
7660 &rsmpi_sg_io);
7661 else {
7662 e = EINVAL;
7663 DBG_PRINTF((category, RSM_DEBUG,
7664 "iovec_ioctl: bad command = %x\n", cmd));
7668 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
7669 "rsm_iovec_ioctl RSMPI oper done %d\n", e));
7671 sg_io.io_residual_count = rsmpi_sg_io.io_residual_count;
7674 * Check for implicit signal post flag and do the signal
7675 * post if needed
7677 if (sg_io.flags & RSM_IMPLICIT_SIGPOST &&
7678 e == RSM_SUCCESS) {
7679 rsmipc_request_t request;
7681 request.rsmipc_key = im_seg->s_segid;
7682 request.rsmipc_hdr.rsmipc_type = RSMIPC_MSG_BELL;
7683 request.rsmipc_segment_cookie = NULL;
7684 e = rsmipc_send(im_seg->s_node, &request, RSM_NO_REPLY);
7686 * Reset the implicit signal post flag to 0 to indicate
7687 * that the signal post has been done and need not be
7688 * done in the RSMAPI library
7690 sg_io.flags &= ~RSM_IMPLICIT_SIGPOST;
7693 rsmseglock_acquire(im_seg);
7694 im_seg->s_rdmacnt--;
7695 if (im_seg->s_rdmacnt == 0) {
7696 cv_broadcast(&im_seg->s_cv);
7698 rsmseglock_release(im_seg);
7699 error = sgio_resid_copyout(arg, &sg_io, mode);
7700 out:
7701 iovec = iovec_start;
7702 ka_iovec = ka_iovec_start;
7703 for (i = 0; i < iov_proc; i++) {
7704 if (ka_iovec->io_type == RSM_HANDLE_TYPE) {
7705 ex_seg = rsmexport_lookup(ka_iovec->local.segid);
7707 ASSERT(ex_seg != NULL);
7708 ASSERT(ex_seg->s_state == RSM_STATE_EXPORT);
7710 ex_seg->s_rdmacnt--; /* unrefcnt the handle */
7711 if (ex_seg->s_rdmacnt == 0) {
7712 cv_broadcast(&ex_seg->s_cv);
7714 rsmseglock_release(ex_seg);
7717 ASSERT(iovec != NULL); /* true if iov_proc > 0 */
7720 * At present there is no dependency on the existence of xbufs
7721 * created by ddi_umem_iosetup for each of the iovecs. So we
7722 * can these xbufs here.
7724 if (iovec->local_mem.ms_type == RSM_MEM_BUF) {
7725 freerbuf(iovec->local_mem.ms_memory.bp);
7728 iovec++;
7729 ka_iovec++;
7732 if (sg_io.io_request_count > RSM_MAX_IOVLEN) {
7733 if (iovec_start)
7734 kmem_free(iovec_start, size);
7735 kmem_free(ka_iovec_start, ka_size);
7738 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
7739 "rsm_iovec_ioctl done %d\n", e));
7740 /* if RSMPI call fails return that else return copyout's retval */
7741 return ((e != RSM_SUCCESS) ? e : error);
7746 static int
7747 rsmaddr_ioctl(int cmd, rsm_ioctlmsg_t *msg, int mode)
7749 adapter_t *adapter;
7750 rsm_addr_t addr;
7751 rsm_node_id_t node;
7752 int rval = DDI_SUCCESS;
7753 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_IOCTL);
7755 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmaddr_ioctl enter\n"));
7757 adapter = rsm_getadapter(msg, mode);
7758 if (adapter == NULL) {
7759 DBG_PRINTF((category, RSM_DEBUG,
7760 "rsmaddr_ioctl done: adapter not found\n"));
7761 return (RSMERR_CTLR_NOT_PRESENT);
7764 switch (cmd) {
7765 case RSM_IOCTL_MAP_TO_ADDR: /* nodeid to hwaddr mapping */
7766 /* returns the hwaddr in msg->hwaddr */
7767 if (msg->nodeid == my_nodeid) {
7768 msg->hwaddr = adapter->hwaddr;
7769 } else {
7770 addr = get_remote_hwaddr(adapter, msg->nodeid);
7771 if ((int64_t)addr < 0) {
7772 rval = RSMERR_INTERNAL_ERROR;
7773 } else {
7774 msg->hwaddr = addr;
7777 break;
7778 case RSM_IOCTL_MAP_TO_NODEID: /* hwaddr to nodeid mapping */
7779 /* returns the nodeid in msg->nodeid */
7780 if (msg->hwaddr == adapter->hwaddr) {
7781 msg->nodeid = my_nodeid;
7782 } else {
7783 node = get_remote_nodeid(adapter, msg->hwaddr);
7784 if ((int)node < 0) {
7785 rval = RSMERR_INTERNAL_ERROR;
7786 } else {
7787 msg->nodeid = (rsm_node_id_t)node;
7790 break;
7791 default:
7792 rval = EINVAL;
7793 break;
7796 rsmka_release_adapter(adapter);
7797 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
7798 "rsmaddr_ioctl done: %d\n", rval));
7799 return (rval);
7802 static int
7803 rsm_ddi_copyin(caddr_t arg, rsm_ioctlmsg_t *msg, int mode)
7805 DBG_DEFINE(category,
7806 RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_IOCTL | RSM_DDI);
7808 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_ddi_copyin enter\n"));
7810 #ifdef _MULTI_DATAMODEL
7812 if ((mode & DATAMODEL_MASK) == DATAMODEL_ILP32) {
7813 rsm_ioctlmsg32_t msg32;
7814 int i;
7816 if (ddi_copyin(arg, (caddr_t)&msg32, sizeof (msg32), mode)) {
7817 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
7818 "rsm_ddi_copyin done: EFAULT\n"));
7819 return (RSMERR_BAD_ADDR);
7821 msg->len = msg32.len;
7822 msg->vaddr = (caddr_t)(uintptr_t)msg32.vaddr;
7823 msg->arg = (caddr_t)(uintptr_t)msg32.arg;
7824 msg->key = msg32.key;
7825 msg->acl_len = msg32.acl_len;
7826 msg->acl = (rsmapi_access_entry_t *)(uintptr_t)msg32.acl;
7827 msg->cnum = msg32.cnum;
7828 msg->cname = (caddr_t)(uintptr_t)msg32.cname;
7829 msg->cname_len = msg32.cname_len;
7830 msg->nodeid = msg32.nodeid;
7831 msg->hwaddr = msg32.hwaddr;
7832 msg->perm = msg32.perm;
7833 for (i = 0; i < 4; i++) {
7834 msg->bar.comp[i].u64 = msg32.bar.comp[i].u64;
7836 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
7837 "rsm_ddi_copyin done\n"));
7838 return (RSM_SUCCESS);
7840 #endif
7841 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_ddi_copyin done\n"));
7842 if (ddi_copyin(arg, (caddr_t)msg, sizeof (*msg), mode))
7843 return (RSMERR_BAD_ADDR);
7844 else
7845 return (RSM_SUCCESS);
7848 static int
7849 rsmattr_ddi_copyout(adapter_t *adapter, caddr_t arg, int mode)
7851 rsmka_int_controller_attr_t rsm_cattr;
7852 DBG_DEFINE(category,
7853 RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_IOCTL | RSM_DDI);
7855 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
7856 "rsmattr_ddi_copyout enter\n"));
7858 * need to copy appropriate data from rsm_controller_attr_t
7859 * to rsmka_int_controller_attr_t
7861 #ifdef _MULTI_DATAMODEL
7862 if ((mode & DATAMODEL_MASK) == DATAMODEL_ILP32) {
7863 rsmka_int_controller_attr32_t rsm_cattr32;
7865 rsm_cattr32.attr_direct_access_sizes =
7866 adapter->rsm_attr.attr_direct_access_sizes;
7867 rsm_cattr32.attr_atomic_sizes =
7868 adapter->rsm_attr.attr_atomic_sizes;
7869 rsm_cattr32.attr_page_size =
7870 adapter->rsm_attr.attr_page_size;
7871 if (adapter->rsm_attr.attr_max_export_segment_size >
7872 UINT_MAX)
7873 rsm_cattr32.attr_max_export_segment_size =
7874 RSM_MAXSZ_PAGE_ALIGNED;
7875 else
7876 rsm_cattr32.attr_max_export_segment_size =
7877 adapter->rsm_attr.attr_max_export_segment_size;
7878 if (adapter->rsm_attr.attr_tot_export_segment_size >
7879 UINT_MAX)
7880 rsm_cattr32.attr_tot_export_segment_size =
7881 RSM_MAXSZ_PAGE_ALIGNED;
7882 else
7883 rsm_cattr32.attr_tot_export_segment_size =
7884 adapter->rsm_attr.attr_tot_export_segment_size;
7885 if (adapter->rsm_attr.attr_max_export_segments >
7886 UINT_MAX)
7887 rsm_cattr32.attr_max_export_segments =
7888 UINT_MAX;
7889 else
7890 rsm_cattr32.attr_max_export_segments =
7891 adapter->rsm_attr.attr_max_export_segments;
7892 if (adapter->rsm_attr.attr_max_import_map_size >
7893 UINT_MAX)
7894 rsm_cattr32.attr_max_import_map_size =
7895 RSM_MAXSZ_PAGE_ALIGNED;
7896 else
7897 rsm_cattr32.attr_max_import_map_size =
7898 adapter->rsm_attr.attr_max_import_map_size;
7899 if (adapter->rsm_attr.attr_tot_import_map_size >
7900 UINT_MAX)
7901 rsm_cattr32.attr_tot_import_map_size =
7902 RSM_MAXSZ_PAGE_ALIGNED;
7903 else
7904 rsm_cattr32.attr_tot_import_map_size =
7905 adapter->rsm_attr.attr_tot_import_map_size;
7906 if (adapter->rsm_attr.attr_max_import_segments >
7907 UINT_MAX)
7908 rsm_cattr32.attr_max_import_segments =
7909 UINT_MAX;
7910 else
7911 rsm_cattr32.attr_max_import_segments =
7912 adapter->rsm_attr.attr_max_import_segments;
7913 rsm_cattr32.attr_controller_addr =
7914 adapter->rsm_attr.attr_controller_addr;
7916 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
7917 "rsmattr_ddi_copyout done\n"));
7918 if (ddi_copyout((caddr_t)&rsm_cattr32, arg,
7919 sizeof (rsmka_int_controller_attr32_t), mode)) {
7920 return (RSMERR_BAD_ADDR);
7922 else
7923 return (RSM_SUCCESS);
7925 #endif
7926 rsm_cattr.attr_direct_access_sizes =
7927 adapter->rsm_attr.attr_direct_access_sizes;
7928 rsm_cattr.attr_atomic_sizes =
7929 adapter->rsm_attr.attr_atomic_sizes;
7930 rsm_cattr.attr_page_size =
7931 adapter->rsm_attr.attr_page_size;
7932 rsm_cattr.attr_max_export_segment_size =
7933 adapter->rsm_attr.attr_max_export_segment_size;
7934 rsm_cattr.attr_tot_export_segment_size =
7935 adapter->rsm_attr.attr_tot_export_segment_size;
7936 rsm_cattr.attr_max_export_segments =
7937 adapter->rsm_attr.attr_max_export_segments;
7938 rsm_cattr.attr_max_import_map_size =
7939 adapter->rsm_attr.attr_max_import_map_size;
7940 rsm_cattr.attr_tot_import_map_size =
7941 adapter->rsm_attr.attr_tot_import_map_size;
7942 rsm_cattr.attr_max_import_segments =
7943 adapter->rsm_attr.attr_max_import_segments;
7944 rsm_cattr.attr_controller_addr =
7945 adapter->rsm_attr.attr_controller_addr;
7946 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
7947 "rsmattr_ddi_copyout done\n"));
7948 if (ddi_copyout((caddr_t)&rsm_cattr, arg,
7949 sizeof (rsmka_int_controller_attr_t), mode)) {
7950 return (RSMERR_BAD_ADDR);
7952 else
7953 return (RSM_SUCCESS);
7956 /*ARGSUSED*/
7957 static int
7958 rsm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
7959 int *rvalp)
7961 rsmseg_t *seg;
7962 rsmresource_t *res;
7963 minor_t rnum;
7964 rsm_ioctlmsg_t msg = {0};
7965 int error;
7966 adapter_t *adapter;
7967 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_IOCTL);
7969 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_ioctl enter\n"));
7971 if (cmd == RSM_IOCTL_CONSUMEEVENT) {
7972 error = rsm_consumeevent_ioctl((caddr_t)arg, mode);
7973 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
7974 "rsm_ioctl RSM_IOCTL_CONSUMEEVENT done: %d\n", error));
7975 return (error);
7978 /* topology cmd does not use the arg common to other cmds */
7979 if (RSM_IOCTL_CMDGRP(cmd) == RSM_IOCTL_TOPOLOGY) {
7980 error = rsmka_topology_ioctl((caddr_t)arg, cmd, mode);
7981 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
7982 "rsm_ioctl done: %d\n", error));
7983 return (error);
7986 if (RSM_IOCTL_CMDGRP(cmd) == RSM_IOCTL_IOVEC) {
7987 error = rsm_iovec_ioctl(dev, (caddr_t)arg, cmd, mode, credp);
7988 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
7989 "rsm_ioctl done: %d\n", error));
7990 return (error);
7994 * try to load arguments
7996 if (cmd != RSM_IOCTL_RING_BELL &&
7997 rsm_ddi_copyin((caddr_t)arg, &msg, mode)) {
7998 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
7999 "rsm_ioctl done: EFAULT\n"));
8000 return (RSMERR_BAD_ADDR);
8003 if (cmd == RSM_IOCTL_ATTR) {
8004 adapter = rsm_getadapter(&msg, mode);
8005 if (adapter == NULL) {
8006 DBG_PRINTF((category, RSM_DEBUG,
8007 "rsm_ioctl done: ENODEV\n"));
8008 return (RSMERR_CTLR_NOT_PRESENT);
8010 error = rsmattr_ddi_copyout(adapter, msg.arg, mode);
8011 rsmka_release_adapter(adapter);
8012 DBG_PRINTF((category, RSM_DEBUG,
8013 "rsm_ioctl:after copyout %d\n", error));
8014 return (error);
8017 if (cmd == RSM_IOCTL_BAR_INFO) {
8018 /* Return library off,len of barrier page */
8019 msg.off = barrier_offset;
8020 msg.len = (int)barrier_size;
8021 #ifdef _MULTI_DATAMODEL
8022 if ((mode & DATAMODEL_MASK) == DATAMODEL_ILP32) {
8023 rsm_ioctlmsg32_t msg32;
8025 if (msg.len > UINT_MAX)
8026 msg.len = RSM_MAXSZ_PAGE_ALIGNED;
8027 else
8028 msg32.len = (int32_t)msg.len;
8029 msg32.off = (int32_t)msg.off;
8030 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
8031 "rsm_ioctl done\n"));
8032 if (ddi_copyout((caddr_t)&msg32, (caddr_t)arg,
8033 sizeof (msg32), mode))
8034 return (RSMERR_BAD_ADDR);
8035 else
8036 return (RSM_SUCCESS);
8038 #endif
8039 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
8040 "rsm_ioctl done\n"));
8041 if (ddi_copyout((caddr_t)&msg, (caddr_t)arg,
8042 sizeof (msg), mode))
8043 return (RSMERR_BAD_ADDR);
8044 else
8045 return (RSM_SUCCESS);
8048 if (RSM_IOCTL_CMDGRP(cmd) == RSM_IOCTL_MAP_ADDR) {
8049 /* map the nodeid or hwaddr */
8050 error = rsmaddr_ioctl(cmd, &msg, mode);
8051 if (error == RSM_SUCCESS) {
8052 #ifdef _MULTI_DATAMODEL
8053 if ((mode & DATAMODEL_MASK) == DATAMODEL_ILP32) {
8054 rsm_ioctlmsg32_t msg32;
8056 msg32.hwaddr = (uint64_t)msg.hwaddr;
8057 msg32.nodeid = (uint32_t)msg.nodeid;
8059 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
8060 "rsm_ioctl done\n"));
8061 if (ddi_copyout((caddr_t)&msg32, (caddr_t)arg,
8062 sizeof (msg32), mode))
8063 return (RSMERR_BAD_ADDR);
8064 else
8065 return (RSM_SUCCESS);
8067 #endif
8068 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
8069 "rsm_ioctl done\n"));
8070 if (ddi_copyout((caddr_t)&msg, (caddr_t)arg,
8071 sizeof (msg), mode))
8072 return (RSMERR_BAD_ADDR);
8073 else
8074 return (RSM_SUCCESS);
8076 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
8077 "rsm_ioctl done: %d\n", error));
8078 return (error);
8081 /* Find resource and look it in read mode */
8082 rnum = getminor(dev);
8083 res = rsmresource_lookup(rnum, RSM_NOLOCK);
8084 ASSERT(res != NULL);
8087 * Find command group
8089 switch (RSM_IOCTL_CMDGRP(cmd)) {
8090 case RSM_IOCTL_EXPORT_SEG:
8092 * Export list is searched during publish, loopback and
8093 * remote lookup call.
8095 seg = rsmresource_seg(res, rnum, credp,
8096 RSM_RESOURCE_EXPORT_SEGMENT);
8097 if (seg->s_type == RSM_RESOURCE_EXPORT_SEGMENT) {
8098 error = rsmexport_ioctl(seg, &msg, cmd, arg, mode,
8099 credp);
8100 } else { /* export ioctl on an import/barrier resource */
8101 error = RSMERR_BAD_SEG_HNDL;
8103 break;
8104 case RSM_IOCTL_IMPORT_SEG:
8105 /* Import list is searched during remote unmap call. */
8106 seg = rsmresource_seg(res, rnum, credp,
8107 RSM_RESOURCE_IMPORT_SEGMENT);
8108 if (seg->s_type == RSM_RESOURCE_IMPORT_SEGMENT) {
8109 error = rsmimport_ioctl(seg, &msg, cmd, arg, mode,
8110 credp);
8111 } else { /* import ioctl on an export/barrier resource */
8112 error = RSMERR_BAD_SEG_HNDL;
8114 break;
8115 case RSM_IOCTL_BAR:
8116 if (res != RSMRC_RESERVED &&
8117 res->rsmrc_type == RSM_RESOURCE_IMPORT_SEGMENT) {
8118 error = rsmbar_ioctl((rsmseg_t *)res, &msg, cmd, arg,
8119 mode);
8120 } else { /* invalid res value */
8121 error = RSMERR_BAD_SEG_HNDL;
8123 break;
8124 case RSM_IOCTL_BELL:
8125 if (res != RSMRC_RESERVED) {
8126 if (res->rsmrc_type == RSM_RESOURCE_IMPORT_SEGMENT)
8127 error = exportbell_ioctl((rsmseg_t *)res, cmd);
8128 else if (res->rsmrc_type == RSM_RESOURCE_EXPORT_SEGMENT)
8129 error = importbell_ioctl((rsmseg_t *)res, cmd);
8130 else /* RSM_RESOURCE_BAR */
8131 error = RSMERR_BAD_SEG_HNDL;
8132 } else { /* invalid res value */
8133 error = RSMERR_BAD_SEG_HNDL;
8135 break;
8136 default:
8137 error = EINVAL;
8140 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_ioctl done: %d\n",
8141 error));
8142 return (error);
8146 /* **************************** Segment Mapping Operations ********* */
8147 static rsm_mapinfo_t *
8148 rsm_get_mapinfo(rsmseg_t *seg, off_t off, size_t len, off_t *dev_offset,
8149 size_t *map_len)
8151 rsm_mapinfo_t *p;
8153 * Find the correct mapinfo structure to use during the mapping
8154 * from the seg->s_mapinfo list.
8155 * The seg->s_mapinfo list contains in reverse order the mappings
8156 * as returned by the RSMPI rsm_map. In rsm_devmap, we need to
8157 * access the correct entry within this list for the mapping
8158 * requested.
8160 * The algorithm for selecting a list entry is as follows:
8162 * When start_offset of an entry <= off we have found the entry
8163 * we were looking for. Adjust the dev_offset and map_len (needs
8164 * to be PAGESIZE aligned).
8166 p = seg->s_mapinfo;
8167 for (; p; p = p->next) {
8168 if (p->start_offset <= off) {
8169 *dev_offset = p->dev_offset + off - p->start_offset;
8170 *map_len = (len > p->individual_len) ?
8171 p->individual_len : ptob(btopr(len));
8172 return (p);
8174 p = p->next;
8177 return (NULL);
8180 static void
8181 rsm_free_mapinfo(rsm_mapinfo_t *mapinfo)
8183 rsm_mapinfo_t *p;
8185 while (mapinfo != NULL) {
8186 p = mapinfo;
8187 mapinfo = mapinfo->next;
8188 kmem_free(p, sizeof (*p));
8192 static int
8193 rsmmap_map(devmap_cookie_t dhp, dev_t dev, uint_t flags, offset_t off,
8194 size_t len, void **pvtp)
8196 rsmcookie_t *p;
8197 rsmresource_t *res;
8198 rsmseg_t *seg;
8199 minor_t rnum;
8200 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_DDI);
8202 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmmap_map enter\n"));
8204 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
8205 "rsmmap_map: dhp = %x\n", dhp));
8207 flags = flags;
8209 rnum = getminor(dev);
8210 res = (rsmresource_t *)rsmresource_lookup(rnum, RSM_NOLOCK);
8211 ASSERT(res != NULL);
8213 seg = (rsmseg_t *)res;
8215 rsmseglock_acquire(seg);
8217 ASSERT(seg->s_hdr.rsmrc_type == RSM_RESOURCE_IMPORT_SEGMENT);
8220 * Allocate structure and add cookie to segment list
8222 p = kmem_alloc(sizeof (*p), KM_SLEEP);
8224 p->c_dhp = dhp;
8225 p->c_off = off;
8226 p->c_len = len;
8227 p->c_next = seg->s_ckl;
8228 seg->s_ckl = p;
8230 *pvtp = (void *)seg;
8232 rsmseglock_release(seg);
8234 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmmap_map done\n"));
8235 return (DDI_SUCCESS);
8239 * Page fault handling is done here. The prerequisite mapping setup
8240 * has been done in rsm_devmap with calls to ddi_devmem_setup or
8241 * ddi_umem_setup
8243 static int
8244 rsmmap_access(devmap_cookie_t dhp, void *pvt, offset_t offset, size_t len,
8245 uint_t type, uint_t rw)
8247 int e;
8248 rsmseg_t *seg = (rsmseg_t *)pvt;
8249 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_DDI);
8251 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmmap_access enter\n"));
8253 rsmseglock_acquire(seg);
8255 ASSERT(seg->s_hdr.rsmrc_type == RSM_RESOURCE_IMPORT_SEGMENT);
8257 while (seg->s_state == RSM_STATE_MAP_QUIESCE) {
8258 if (cv_wait_sig(&seg->s_cv, &seg->s_lock) == 0) {
8259 DBG_PRINTF((category, RSM_DEBUG,
8260 "rsmmap_access done: cv_wait INTR"));
8261 rsmseglock_release(seg);
8262 return (RSMERR_INTERRUPTED);
8266 ASSERT(seg->s_state == RSM_STATE_DISCONNECT ||
8267 seg->s_state == RSM_STATE_ACTIVE);
8269 if (seg->s_state == RSM_STATE_DISCONNECT)
8270 seg->s_flags |= RSM_IMPORT_DUMMY;
8272 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
8273 "rsmmap_access: dhp = %x\n", dhp));
8275 rsmseglock_release(seg);
8277 if (e = devmap_load(dhp, offset, len, type, rw)) {
8278 DBG_PRINTF((category, RSM_ERR, "devmap_load failed\n"));
8282 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmmap_access done\n"));
8284 return (e);
8287 static int
8288 rsmmap_dup(devmap_cookie_t dhp, void *oldpvt, devmap_cookie_t new_dhp,
8289 void **newpvt)
8291 rsmseg_t *seg = (rsmseg_t *)oldpvt;
8292 rsmcookie_t *p, *old;
8293 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_DDI);
8295 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmmap_dup enter\n"));
8298 * Same as map, create an entry to hold cookie and add it to
8299 * connect segment list. The oldpvt is a pointer to segment.
8300 * Return segment pointer in newpvt.
8302 rsmseglock_acquire(seg);
8304 ASSERT(seg->s_hdr.rsmrc_type == RSM_RESOURCE_IMPORT_SEGMENT);
8307 * Find old cookie
8309 for (old = seg->s_ckl; old != NULL; old = old->c_next) {
8310 if (old->c_dhp == dhp) {
8311 break;
8314 if (old == NULL) {
8315 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
8316 "rsmmap_dup done: EINVAL\n"));
8317 rsmseglock_release(seg);
8318 return (EINVAL);
8321 p = kmem_alloc(sizeof (*p), KM_SLEEP);
8323 p->c_dhp = new_dhp;
8324 p->c_off = old->c_off;
8325 p->c_len = old->c_len;
8326 p->c_next = seg->s_ckl;
8327 seg->s_ckl = p;
8329 *newpvt = (void *)seg;
8331 rsmseglock_release(seg);
8333 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmmap_dup done\n"));
8335 return (DDI_SUCCESS);
8338 static void
8339 rsmmap_unmap(devmap_cookie_t dhp, void *pvtp, offset_t off, size_t len,
8340 devmap_cookie_t new_dhp1, void **pvtp1,
8341 devmap_cookie_t new_dhp2, void **pvtp2)
8344 * Remove pvtp structure from segment list.
8346 rsmseg_t *seg = (rsmseg_t *)pvtp;
8347 int freeflag;
8349 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_DDI);
8351 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmmap_unmap enter\n"));
8353 off = off; len = len;
8354 pvtp1 = pvtp1; pvtp2 = pvtp2;
8356 rsmseglock_acquire(seg);
8358 ASSERT(seg->s_hdr.rsmrc_type == RSM_RESOURCE_IMPORT_SEGMENT);
8360 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
8361 "rsmmap_unmap: dhp = %x\n", dhp));
8363 * We can go ahead and remove the dhps even if we are in
8364 * the MAPPING state because the dhps being removed here
8365 * belong to a different mmap and we are holding the segment
8366 * lock.
8368 if (new_dhp1 == NULL && new_dhp2 == NULL) {
8369 /* find and remove dhp handle */
8370 rsmcookie_t *tmp, **back = &seg->s_ckl;
8372 while (*back != NULL) {
8373 tmp = *back;
8374 if (tmp->c_dhp == dhp) {
8375 *back = tmp->c_next;
8376 kmem_free(tmp, sizeof (*tmp));
8377 break;
8379 back = &tmp->c_next;
8381 } else {
8382 DBG_PRINTF((category, RSM_DEBUG_LVL2,
8383 "rsmmap_unmap:parital unmap"
8384 "new_dhp1 %lx, new_dhp2 %lx\n",
8385 (size_t)new_dhp1, (size_t)new_dhp2));
8389 * rsmmap_unmap is called for each mapping cookie on the list.
8390 * When the list becomes empty and we are not in the MAPPING
8391 * state then unmap in the rsmpi driver.
8393 if ((seg->s_ckl == NULL) && (seg->s_state != RSM_STATE_MAPPING))
8394 (void) rsm_unmap(seg);
8396 if (seg->s_state == RSM_STATE_END && seg->s_ckl == NULL) {
8397 freeflag = 1;
8398 } else {
8399 freeflag = 0;
8402 rsmseglock_release(seg);
8404 if (freeflag) {
8405 /* Free the segment structure */
8406 rsmseg_free(seg);
8408 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmmap_unmap done\n"));
8412 static struct devmap_callback_ctl rsmmap_ops = {
8413 DEVMAP_OPS_REV, /* devmap_ops version number */
8414 rsmmap_map, /* devmap_ops map routine */
8415 rsmmap_access, /* devmap_ops access routine */
8416 rsmmap_dup, /* devmap_ops dup routine */
8417 rsmmap_unmap, /* devmap_ops unmap routine */
8420 static int
8421 rsm_devmap(dev_t dev, devmap_cookie_t dhc, offset_t off, size_t len,
8422 size_t *maplen, uint_t model /*ARGSUSED*/)
8424 struct devmap_callback_ctl *callbackops = &rsmmap_ops;
8425 int err;
8426 uint_t maxprot;
8427 minor_t rnum;
8428 rsmseg_t *seg;
8429 off_t dev_offset;
8430 size_t cur_len;
8431 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_DDI);
8433 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_devmap enter\n"));
8435 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
8436 "rsm_devmap: off = %lx, len = %lx\n", off, len));
8437 rnum = getminor(dev);
8438 seg = (rsmseg_t *)rsmresource_lookup(rnum, RSM_NOLOCK);
8439 ASSERT(seg != NULL);
8441 if (seg->s_hdr.rsmrc_type == RSM_RESOURCE_BAR) {
8442 if ((off == barrier_offset) &&
8443 (len == barrier_size)) {
8445 ASSERT(bar_va != NULL && bar_cookie != NULL);
8448 * The offset argument in devmap_umem_setup represents
8449 * the offset within the kernel memory defined by the
8450 * cookie. We use this offset as barrier_offset.
8452 err = devmap_umem_setup(dhc, rsm_dip, NULL, bar_cookie,
8453 barrier_offset, len, PROT_USER|PROT_READ,
8454 DEVMAP_DEFAULTS, 0);
8456 if (err != 0) {
8457 DBG_PRINTF((category, RSM_ERR,
8458 "rsm_devmap done: %d\n", err));
8459 return (RSMERR_MAP_FAILED);
8461 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
8462 "rsm_devmap done: %d\n", err));
8464 *maplen = barrier_size;
8466 return (err);
8467 } else {
8468 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
8469 "rsm_devmap done: %d\n", err));
8470 return (RSMERR_MAP_FAILED);
8474 ASSERT(seg->s_hdr.rsmrc_type == RSM_RESOURCE_IMPORT_SEGMENT);
8475 ASSERT(seg->s_state == RSM_STATE_MAPPING);
8478 * Make sure we still have permission for the map operation.
8480 maxprot = PROT_USER;
8481 if (seg->s_mode & RSM_PERM_READ) {
8482 maxprot |= PROT_READ;
8485 if (seg->s_mode & RSM_PERM_WRITE) {
8486 maxprot |= PROT_WRITE;
8490 * For each devmap call, rsmmap_map is called. This maintains driver
8491 * private information for the mapping. Thus, if there are multiple
8492 * devmap calls there will be multiple rsmmap_map calls and for each
8493 * call, the mapping information will be stored.
8494 * In case of an error during the processing of the devmap call, error
8495 * will be returned. This error return causes the caller of rsm_devmap
8496 * to undo all the mappings by calling rsmmap_unmap for each one.
8497 * rsmmap_unmap will free up the private information for the requested
8498 * mapping.
8500 if (seg->s_node != my_nodeid) {
8501 rsm_mapinfo_t *p;
8503 p = rsm_get_mapinfo(seg, off, len, &dev_offset, &cur_len);
8504 if (p == NULL) {
8505 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
8506 "rsm_devmap: incorrect mapping info\n"));
8507 return (RSMERR_MAP_FAILED);
8509 err = devmap_devmem_setup(dhc, p->dip,
8510 callbackops, p->dev_register,
8511 dev_offset, cur_len, maxprot,
8512 DEVMAP_ALLOW_REMAP | DEVMAP_DEFAULTS, 0);
8514 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
8515 "rsm_devmap: dip=%lx,dreg=%lu,doff=%lx,"
8516 "off=%lx,len=%lx\n",
8517 p->dip, p->dev_register, dev_offset, off, cur_len));
8519 if (err != 0) {
8520 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
8521 "rsm_devmap: devmap_devmem_setup failed %d\n",
8522 err));
8523 return (RSMERR_MAP_FAILED);
8525 /* cur_len is always an integral multiple pagesize */
8526 ASSERT((cur_len & (PAGESIZE-1)) == 0);
8527 *maplen = cur_len;
8528 return (err);
8530 } else {
8531 err = devmap_umem_setup(dhc, rsm_dip, callbackops,
8532 seg->s_cookie, off, len, maxprot,
8533 DEVMAP_ALLOW_REMAP|DEVMAP_DEFAULTS, 0);
8534 if (err != 0) {
8535 DBG_PRINTF((category, RSM_DEBUG,
8536 "rsm_devmap: devmap_umem_setup failed %d\n",
8537 err));
8538 return (RSMERR_MAP_FAILED);
8540 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
8541 "rsm_devmap: loopback done\n"));
8543 *maplen = ptob(btopr(len));
8545 return (err);
8550 * We can use the devmap framework for mapping device memory to user space by
8551 * specifying this routine in the rsm_cb_ops structure. The kernel mmap
8552 * processing calls this entry point and devmap_setup is called within this
8553 * function, which eventually calls rsm_devmap
8555 static int
8556 rsm_segmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
8557 uint_t prot, uint_t maxprot, uint_t flags, struct cred *cred)
8559 int error = 0;
8560 int old_state;
8561 minor_t rnum;
8562 rsmseg_t *seg, *eseg;
8563 adapter_t *adapter;
8564 rsm_import_share_t *sharedp;
8565 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_DDI);
8567 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_segmap enter\n"));
8570 * find segment
8572 rnum = getminor(dev);
8573 seg = (rsmseg_t *)rsmresource_lookup(rnum, RSM_LOCK);
8575 if (seg == NULL) {
8576 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
8577 "rsm_segmap done: invalid segment\n"));
8578 return (EINVAL);
8582 * the user is trying to map a resource that has not been
8583 * defined yet. The library uses this to map in the
8584 * barrier page.
8586 if (seg->s_hdr.rsmrc_type == RSM_RESOURCE_BAR) {
8587 rsmseglock_release(seg);
8590 * The mapping for the barrier page is identified
8591 * by the special offset barrier_offset
8594 if (off == (off_t)barrier_offset ||
8595 len == (off_t)barrier_size) {
8596 if (bar_cookie == NULL || bar_va == NULL) {
8597 DBG_PRINTF((category, RSM_DEBUG,
8598 "rsm_segmap: bar cookie/va is NULL\n"));
8599 return (EINVAL);
8602 error = devmap_setup(dev, (offset_t)off, as, addrp,
8603 (size_t)len, prot, maxprot, flags, cred);
8605 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
8606 "rsm_segmap done: %d\n", error));
8607 return (error);
8608 } else {
8609 DBG_PRINTF((category, RSM_DEBUG,
8610 "rsm_segmap: bad offset/length\n"));
8611 return (EINVAL);
8615 /* Make sure you can only map imported segments */
8616 if (seg->s_hdr.rsmrc_type != RSM_RESOURCE_IMPORT_SEGMENT) {
8617 rsmseglock_release(seg);
8618 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
8619 "rsm_segmap done: not an import segment\n"));
8620 return (EINVAL);
8622 /* check means library is broken */
8623 ASSERT(seg->s_hdr.rsmrc_num == rnum);
8625 /* wait for the segment to become unquiesced */
8626 while (seg->s_state == RSM_STATE_CONN_QUIESCE) {
8627 if (cv_wait_sig(&seg->s_cv, &seg->s_lock) == 0) {
8628 rsmseglock_release(seg);
8629 DBG_PRINTF((category, RSM_DEBUG,
8630 "rsm_segmap done: cv_wait INTR"));
8631 return (ENODEV);
8635 /* wait until segment leaves the mapping state */
8636 while (seg->s_state == RSM_STATE_MAPPING)
8637 cv_wait(&seg->s_cv, &seg->s_lock);
8640 * we allow multiple maps of the same segment in the KA
8641 * and it works because we do an rsmpi map of the whole
8642 * segment during the first map and all the device mapping
8643 * information needed in rsm_devmap is in the mapinfo list.
8645 if ((seg->s_state != RSM_STATE_CONNECT) &&
8646 (seg->s_state != RSM_STATE_ACTIVE)) {
8647 rsmseglock_release(seg);
8648 DBG_PRINTF((category, RSM_DEBUG,
8649 "rsm_segmap done: segment not connected\n"));
8650 return (ENODEV);
8654 * Make sure we are not mapping a larger segment than what's
8655 * exported
8657 if ((size_t)off + ptob(btopr(len)) > seg->s_len) {
8658 rsmseglock_release(seg);
8659 DBG_PRINTF((category, RSM_DEBUG,
8660 "rsm_segmap done: off+len>seg size\n"));
8661 return (ENXIO);
8665 * Make sure we still have permission for the map operation.
8667 maxprot = PROT_USER;
8668 if (seg->s_mode & RSM_PERM_READ) {
8669 maxprot |= PROT_READ;
8672 if (seg->s_mode & RSM_PERM_WRITE) {
8673 maxprot |= PROT_WRITE;
8676 if ((prot & maxprot) != prot) {
8677 /* No permission */
8678 rsmseglock_release(seg);
8679 DBG_PRINTF((category, RSM_DEBUG,
8680 "rsm_segmap done: no permission\n"));
8681 return (EACCES);
8684 old_state = seg->s_state;
8686 ASSERT(seg->s_share != NULL);
8688 rsmsharelock_acquire(seg);
8690 sharedp = seg->s_share;
8692 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
8693 "rsm_segmap:RSMSI_STATE=%d\n", sharedp->rsmsi_state));
8695 if ((sharedp->rsmsi_state != RSMSI_STATE_CONNECTED) &&
8696 (sharedp->rsmsi_state != RSMSI_STATE_MAPPED)) {
8697 rsmsharelock_release(seg);
8698 rsmseglock_release(seg);
8699 DBG_PRINTF((category, RSM_DEBUG,
8700 "rsm_segmap done:RSMSI_STATE %d invalid\n",
8701 sharedp->rsmsi_state));
8702 return (ENODEV);
8706 * Do the map - since we want importers to share mappings
8707 * we do the rsmpi map for the whole segment
8709 if (seg->s_node != my_nodeid) {
8710 uint_t dev_register;
8711 off_t dev_offset;
8712 dev_info_t *dip;
8713 size_t tmp_len;
8714 size_t total_length_mapped = 0;
8715 size_t length_to_map = seg->s_len;
8716 off_t tmp_off = 0;
8717 rsm_mapinfo_t *p;
8720 * length_to_map = seg->s_len is always an integral
8721 * multiple of PAGESIZE. Length mapped in each entry in mapinfo
8722 * list is a multiple of PAGESIZE - RSMPI map ensures this
8725 adapter = seg->s_adapter;
8726 ASSERT(sharedp->rsmsi_state == RSMSI_STATE_CONNECTED ||
8727 sharedp->rsmsi_state == RSMSI_STATE_MAPPED);
8729 if (sharedp->rsmsi_state == RSMSI_STATE_CONNECTED) {
8730 error = 0;
8731 /* map the whole segment */
8732 while (total_length_mapped < seg->s_len) {
8733 tmp_len = 0;
8735 error = adapter->rsmpi_ops->rsm_map(
8736 seg->s_handle.in, tmp_off,
8737 length_to_map, &tmp_len,
8738 &dip, &dev_register, &dev_offset,
8739 NULL, NULL);
8741 if (error != 0)
8742 break;
8745 * Store the mapping info obtained from rsm_map
8747 p = kmem_alloc(sizeof (*p), KM_SLEEP);
8748 p->dev_register = dev_register;
8749 p->dev_offset = dev_offset;
8750 p->dip = dip;
8751 p->individual_len = tmp_len;
8752 p->start_offset = tmp_off;
8753 p->next = sharedp->rsmsi_mapinfo;
8754 sharedp->rsmsi_mapinfo = p;
8756 total_length_mapped += tmp_len;
8757 length_to_map -= tmp_len;
8758 tmp_off += tmp_len;
8760 seg->s_mapinfo = sharedp->rsmsi_mapinfo;
8762 if (error != RSM_SUCCESS) {
8763 /* Check if this is the the first rsm_map */
8764 if (sharedp->rsmsi_mapinfo != NULL) {
8766 * A single rsm_unmap undoes
8767 * multiple rsm_maps.
8769 (void) seg->s_adapter->rsmpi_ops->
8770 rsm_unmap(sharedp->rsmsi_handle);
8771 rsm_free_mapinfo(sharedp->
8772 rsmsi_mapinfo);
8774 sharedp->rsmsi_mapinfo = NULL;
8775 sharedp->rsmsi_state = RSMSI_STATE_CONNECTED;
8776 rsmsharelock_release(seg);
8777 rsmseglock_release(seg);
8778 DBG_PRINTF((category, RSM_DEBUG,
8779 "rsm_segmap done: rsmpi map err %d\n",
8780 error));
8781 ASSERT(error != RSMERR_BAD_LENGTH &&
8782 error != RSMERR_BAD_MEM_ALIGNMENT &&
8783 error != RSMERR_BAD_SEG_HNDL);
8784 if (error == RSMERR_UNSUPPORTED_OPERATION)
8785 return (ENOTSUP);
8786 else if (error == RSMERR_INSUFFICIENT_RESOURCES)
8787 return (EAGAIN);
8788 else if (error == RSMERR_CONN_ABORTED)
8789 return (ENODEV);
8790 else
8791 return (error);
8792 } else {
8793 sharedp->rsmsi_state = RSMSI_STATE_MAPPED;
8795 } else {
8796 seg->s_mapinfo = sharedp->rsmsi_mapinfo;
8799 sharedp->rsmsi_mapcnt++;
8801 rsmsharelock_release(seg);
8803 /* move to an intermediate mapping state */
8804 seg->s_state = RSM_STATE_MAPPING;
8805 rsmseglock_release(seg);
8807 error = devmap_setup(dev, (offset_t)off, as, addrp,
8808 len, prot, maxprot, flags, cred);
8810 rsmseglock_acquire(seg);
8811 ASSERT(seg->s_state == RSM_STATE_MAPPING);
8813 if (error == DDI_SUCCESS) {
8814 seg->s_state = RSM_STATE_ACTIVE;
8815 } else {
8816 rsmsharelock_acquire(seg);
8818 ASSERT(sharedp->rsmsi_state == RSMSI_STATE_MAPPED);
8820 sharedp->rsmsi_mapcnt--;
8821 if (sharedp->rsmsi_mapcnt == 0) {
8822 /* unmap the shared RSMPI mapping */
8823 ASSERT(sharedp->rsmsi_handle != NULL);
8824 (void) adapter->rsmpi_ops->
8825 rsm_unmap(sharedp->rsmsi_handle);
8826 rsm_free_mapinfo(sharedp->rsmsi_mapinfo);
8827 sharedp->rsmsi_mapinfo = NULL;
8828 sharedp->rsmsi_state = RSMSI_STATE_CONNECTED;
8831 rsmsharelock_release(seg);
8832 seg->s_state = old_state;
8833 DBG_PRINTF((category, RSM_ERR,
8834 "rsm: devmap_setup failed %d\n", error));
8836 cv_broadcast(&seg->s_cv);
8837 rsmseglock_release(seg);
8838 DBG_PRINTF((category, RSM_DEBUG_LVL2, "rsm_segmap done: %d\n",
8839 error));
8840 return (error);
8841 } else {
8843 * For loopback, the export segment mapping cookie (s_cookie)
8844 * is also used as the s_cookie value for its import segments
8845 * during mapping.
8846 * Note that reference counting for s_cookie of the export
8847 * segment is not required due to the following:
8848 * We never have a case of the export segment being destroyed,
8849 * leaving the import segments with a stale value for the
8850 * s_cookie field, since a force disconnect is done prior to a
8851 * destroy of an export segment. The force disconnect causes
8852 * the s_cookie value to be reset to NULL. Also for the
8853 * rsm_rebind operation, we change the s_cookie value of the
8854 * export segment as well as of all its local (loopback)
8855 * importers.
8857 DBG_ADDCATEGORY(category, RSM_LOOPBACK);
8859 rsmsharelock_release(seg);
8861 * In order to maintain the lock ordering between the export
8862 * and import segment locks, we need to acquire the export
8863 * segment lock first and only then acquire the import
8864 * segment lock.
8865 * The above is necessary to avoid any deadlock scenarios
8866 * with rsm_rebind which also acquires both the export
8867 * and import segment locks in the above mentioned order.
8868 * Based on code inspection, there seem to be no other
8869 * situations in which both the export and import segment
8870 * locks are acquired either in the same or opposite order
8871 * as mentioned above.
8872 * Thus in order to conform to the above lock order, we
8873 * need to change the state of the import segment to
8874 * RSM_STATE_MAPPING, release the lock. Once this is done we
8875 * can now safely acquire the export segment lock first
8876 * followed by the import segment lock which is as per
8877 * the lock order mentioned above.
8879 /* move to an intermediate mapping state */
8880 seg->s_state = RSM_STATE_MAPPING;
8881 rsmseglock_release(seg);
8883 eseg = rsmexport_lookup(seg->s_key);
8885 if (eseg == NULL) {
8886 rsmseglock_acquire(seg);
8888 * Revert to old_state and signal any waiters
8889 * The shared state is not changed
8892 seg->s_state = old_state;
8893 cv_broadcast(&seg->s_cv);
8894 rsmseglock_release(seg);
8895 DBG_PRINTF((category, RSM_DEBUG,
8896 "rsm_segmap done: key %d not found\n", seg->s_key));
8897 return (ENODEV);
8900 rsmsharelock_acquire(seg);
8901 ASSERT(sharedp->rsmsi_state == RSMSI_STATE_CONNECTED ||
8902 sharedp->rsmsi_state == RSMSI_STATE_MAPPED);
8904 sharedp->rsmsi_mapcnt++;
8905 sharedp->rsmsi_state = RSMSI_STATE_MAPPED;
8906 rsmsharelock_release(seg);
8908 ASSERT(eseg->s_cookie != NULL);
8911 * It is not required or necessary to acquire the import
8912 * segment lock here to change the value of s_cookie since
8913 * no one will touch the import segment as long as it is
8914 * in the RSM_STATE_MAPPING state.
8916 seg->s_cookie = eseg->s_cookie;
8918 rsmseglock_release(eseg);
8920 error = devmap_setup(dev, (offset_t)off, as, addrp, (size_t)len,
8921 prot, maxprot, flags, cred);
8923 rsmseglock_acquire(seg);
8924 ASSERT(seg->s_state == RSM_STATE_MAPPING);
8925 if (error == 0) {
8926 seg->s_state = RSM_STATE_ACTIVE;
8927 } else {
8928 rsmsharelock_acquire(seg);
8930 ASSERT(sharedp->rsmsi_state == RSMSI_STATE_MAPPED);
8932 sharedp->rsmsi_mapcnt--;
8933 if (sharedp->rsmsi_mapcnt == 0) {
8934 sharedp->rsmsi_mapinfo = NULL;
8935 sharedp->rsmsi_state = RSMSI_STATE_CONNECTED;
8937 rsmsharelock_release(seg);
8938 seg->s_state = old_state;
8939 seg->s_cookie = NULL;
8941 cv_broadcast(&seg->s_cv);
8942 rsmseglock_release(seg);
8943 DBG_PRINTF((category, RSM_DEBUG_LVL2,
8944 "rsm_segmap done: %d\n", error));
8945 return (error);
8950 rsmka_null_seg_create(
8951 rsm_controller_handle_t argcp,
8952 rsm_memseg_export_handle_t *handle,
8953 size_t size,
8954 uint_t flags,
8955 rsm_memory_local_t *memory,
8956 rsm_resource_callback_t callback,
8957 rsm_resource_callback_arg_t callback_arg /*ARGSUSED*/)
8959 return (RSM_SUCCESS);
8964 rsmka_null_seg_destroy(
8965 rsm_memseg_export_handle_t argmemseg /*ARGSUSED*/)
8967 return (RSM_SUCCESS);
8972 rsmka_null_bind(
8973 rsm_memseg_export_handle_t argmemseg,
8974 off_t offset,
8975 rsm_memory_local_t *argmemory,
8976 rsm_resource_callback_t callback,
8977 rsm_resource_callback_arg_t callback_arg /*ARGSUSED*/)
8979 return (RSM_SUCCESS);
8984 rsmka_null_unbind(
8985 rsm_memseg_export_handle_t argmemseg,
8986 off_t offset,
8987 size_t length /*ARGSUSED*/)
8989 return (DDI_SUCCESS);
8993 rsmka_null_rebind(
8994 rsm_memseg_export_handle_t argmemseg,
8995 off_t offset,
8996 rsm_memory_local_t *memory,
8997 rsm_resource_callback_t callback,
8998 rsm_resource_callback_arg_t callback_arg /*ARGSUSED*/)
9000 return (RSM_SUCCESS);
9004 rsmka_null_publish(
9005 rsm_memseg_export_handle_t argmemseg,
9006 rsm_access_entry_t access_list[],
9007 uint_t access_list_length,
9008 rsm_memseg_id_t segment_id,
9009 rsm_resource_callback_t callback,
9010 rsm_resource_callback_arg_t callback_arg /*ARGSUSED*/)
9012 return (RSM_SUCCESS);
9017 rsmka_null_republish(
9018 rsm_memseg_export_handle_t memseg,
9019 rsm_access_entry_t access_list[],
9020 uint_t access_list_length,
9021 rsm_resource_callback_t callback,
9022 rsm_resource_callback_arg_t callback_arg /*ARGSUSED*/)
9024 return (RSM_SUCCESS);
9028 rsmka_null_unpublish(
9029 rsm_memseg_export_handle_t argmemseg /*ARGSUSED*/)
9031 return (RSM_SUCCESS);
9035 void
9036 rsmka_init_loopback()
9038 rsm_ops_t *ops = &null_rsmpi_ops;
9039 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_LOOPBACK);
9041 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
9042 "rsmka_init_loopback enter\n"));
9044 /* initialize null ops vector */
9045 ops->rsm_seg_create = rsmka_null_seg_create;
9046 ops->rsm_seg_destroy = rsmka_null_seg_destroy;
9047 ops->rsm_bind = rsmka_null_bind;
9048 ops->rsm_unbind = rsmka_null_unbind;
9049 ops->rsm_rebind = rsmka_null_rebind;
9050 ops->rsm_publish = rsmka_null_publish;
9051 ops->rsm_unpublish = rsmka_null_unpublish;
9052 ops->rsm_republish = rsmka_null_republish;
9054 /* initialize attributes for loopback adapter */
9055 loopback_attr.attr_name = loopback_str;
9056 loopback_attr.attr_page_size = 0x8; /* 8K */
9058 /* initialize loopback adapter */
9059 loopback_adapter.rsm_attr = loopback_attr;
9060 loopback_adapter.rsmpi_ops = &null_rsmpi_ops;
9061 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
9062 "rsmka_init_loopback done\n"));
9065 /* ************** DR functions ********************************** */
9066 static void
9067 rsm_quiesce_exp_seg(rsmresource_t *resp)
9069 int recheck_state;
9070 rsmseg_t *segp = (rsmseg_t *)resp;
9071 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL);
9072 DBG_DEFINE_STR(function, "rsm_unquiesce_exp_seg");
9074 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
9075 "%s enter: key=%u\n", function, segp->s_key));
9077 rsmseglock_acquire(segp);
9078 do {
9079 recheck_state = 0;
9080 if ((segp->s_state == RSM_STATE_NEW_QUIESCED) ||
9081 (segp->s_state == RSM_STATE_BIND_QUIESCED) ||
9082 (segp->s_state == RSM_STATE_EXPORT_QUIESCING) ||
9083 (segp->s_state == RSM_STATE_EXPORT_QUIESCED)) {
9084 rsmseglock_release(segp);
9085 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
9086 "%s done:state =%d\n", function,
9087 segp->s_state));
9088 return;
9091 if (segp->s_state == RSM_STATE_NEW) {
9092 segp->s_state = RSM_STATE_NEW_QUIESCED;
9093 rsmseglock_release(segp);
9094 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
9095 "%s done:state =%d\n", function,
9096 segp->s_state));
9097 return;
9100 if (segp->s_state == RSM_STATE_BIND) {
9101 /* unbind */
9102 (void) rsm_unbind_pages(segp);
9103 segp->s_state = RSM_STATE_BIND_QUIESCED;
9104 rsmseglock_release(segp);
9105 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
9106 "%s done:state =%d\n", function,
9107 segp->s_state));
9108 return;
9111 if (segp->s_state == RSM_STATE_EXPORT) {
9113 * wait for putv/getv to complete if the segp is
9114 * a local memory handle
9116 while ((segp->s_state == RSM_STATE_EXPORT) &&
9117 (segp->s_rdmacnt != 0)) {
9118 cv_wait(&segp->s_cv, &segp->s_lock);
9121 if (segp->s_state != RSM_STATE_EXPORT) {
9123 * state changed need to see what it
9124 * should be changed to.
9126 recheck_state = 1;
9127 continue;
9130 segp->s_state = RSM_STATE_EXPORT_QUIESCING;
9131 rsmseglock_release(segp);
9133 * send SUSPEND messages - currently it will be
9134 * done at the end
9136 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
9137 "%s done:state =%d\n", function,
9138 segp->s_state));
9139 return;
9141 } while (recheck_state);
9143 rsmseglock_release(segp);
9146 static void
9147 rsm_unquiesce_exp_seg(rsmresource_t *resp)
9149 int ret;
9150 rsmseg_t *segp = (rsmseg_t *)resp;
9151 rsmapi_access_entry_t *acl;
9152 rsm_access_entry_t *rsmpi_acl;
9153 int acl_len;
9154 int create_flags = 0;
9155 struct buf *xbuf;
9156 rsm_memory_local_t mem;
9157 adapter_t *adapter;
9158 dev_t sdev = 0;
9159 rsm_resource_callback_t callback_flag;
9160 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL);
9161 DBG_DEFINE_STR(function, "rsm_unquiesce_exp_seg");
9163 rsmseglock_acquire(segp);
9165 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
9166 "%s enter: key=%u, state=%d\n", function, segp->s_key,
9167 segp->s_state));
9169 if ((segp->s_state == RSM_STATE_NEW) ||
9170 (segp->s_state == RSM_STATE_BIND) ||
9171 (segp->s_state == RSM_STATE_EXPORT)) {
9172 rsmseglock_release(segp);
9173 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "%s done:state=%d\n",
9174 function, segp->s_state));
9175 return;
9178 if (segp->s_state == RSM_STATE_NEW_QUIESCED) {
9179 segp->s_state = RSM_STATE_NEW;
9180 cv_broadcast(&segp->s_cv);
9181 rsmseglock_release(segp);
9182 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "%s done:state=%d\n",
9183 function, segp->s_state));
9184 return;
9187 if (segp->s_state == RSM_STATE_BIND_QUIESCED) {
9188 /* bind the segment */
9189 ret = rsm_bind_pages(&segp->s_cookie, segp->s_region.r_vaddr,
9190 segp->s_len, segp->s_proc);
9191 if (ret == RSM_SUCCESS) { /* bind successful */
9192 segp->s_state = RSM_STATE_BIND;
9193 } else { /* bind failed - resource unavailable */
9194 segp->s_state = RSM_STATE_NEW;
9196 cv_broadcast(&segp->s_cv);
9197 rsmseglock_release(segp);
9198 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
9199 "%s done: bind_qscd bind = %d\n", function, ret));
9200 return;
9203 while (segp->s_state == RSM_STATE_EXPORT_QUIESCING) {
9204 /* wait for the segment to move to EXPORT_QUIESCED state */
9205 cv_wait(&segp->s_cv, &segp->s_lock);
9208 if (segp->s_state == RSM_STATE_EXPORT_QUIESCED) {
9209 /* bind the segment */
9210 ret = rsm_bind_pages(&segp->s_cookie, segp->s_region.r_vaddr,
9211 segp->s_len, segp->s_proc);
9213 if (ret != RSM_SUCCESS) {
9214 /* bind failed - resource unavailable */
9215 acl_len = segp->s_acl_len;
9216 acl = segp->s_acl;
9217 rsmpi_acl = segp->s_acl_in;
9218 segp->s_acl_len = 0;
9219 segp->s_acl = NULL;
9220 segp->s_acl_in = NULL;
9221 rsmseglock_release(segp);
9223 rsmexport_rm(segp);
9224 rsmacl_free(acl, acl_len);
9225 rsmpiacl_free(rsmpi_acl, acl_len);
9227 rsmseglock_acquire(segp);
9228 segp->s_state = RSM_STATE_NEW;
9229 cv_broadcast(&segp->s_cv);
9230 rsmseglock_release(segp);
9231 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
9232 "%s done: exp_qscd bind failed = %d\n",
9233 function, ret));
9234 return;
9237 * publish the segment
9238 * if successful
9239 * segp->s_state = RSM_STATE_EXPORT;
9240 * else failed
9241 * segp->s_state = RSM_STATE_BIND;
9244 /* check whether it is a local_memory_handle */
9245 if (segp->s_acl != (rsmapi_access_entry_t *)NULL) {
9246 if ((segp->s_acl[0].ae_node == my_nodeid) &&
9247 (segp->s_acl[0].ae_permission == 0)) {
9248 segp->s_state = RSM_STATE_EXPORT;
9249 cv_broadcast(&segp->s_cv);
9250 rsmseglock_release(segp);
9251 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
9252 "%s done:exp_qscd\n", function));
9253 return;
9256 xbuf = ddi_umem_iosetup(segp->s_cookie, 0, segp->s_len, B_WRITE,
9257 sdev, 0, NULL, DDI_UMEM_SLEEP);
9258 ASSERT(xbuf != NULL);
9260 mem.ms_type = RSM_MEM_BUF;
9261 mem.ms_bp = xbuf;
9263 adapter = segp->s_adapter;
9265 if (segp->s_flags & RSMKA_ALLOW_UNBIND_REBIND) {
9266 create_flags = RSM_ALLOW_UNBIND_REBIND;
9269 if (segp->s_flags & RSMKA_SET_RESOURCE_DONTWAIT) {
9270 callback_flag = RSM_RESOURCE_DONTWAIT;
9271 } else {
9272 callback_flag = RSM_RESOURCE_SLEEP;
9275 ret = adapter->rsmpi_ops->rsm_seg_create(
9276 adapter->rsmpi_handle, &segp->s_handle.out,
9277 segp->s_len, create_flags, &mem,
9278 callback_flag, NULL);
9280 if (ret != RSM_SUCCESS) {
9281 acl_len = segp->s_acl_len;
9282 acl = segp->s_acl;
9283 rsmpi_acl = segp->s_acl_in;
9284 segp->s_acl_len = 0;
9285 segp->s_acl = NULL;
9286 segp->s_acl_in = NULL;
9287 rsmseglock_release(segp);
9289 rsmexport_rm(segp);
9290 rsmacl_free(acl, acl_len);
9291 rsmpiacl_free(rsmpi_acl, acl_len);
9293 rsmseglock_acquire(segp);
9294 segp->s_state = RSM_STATE_BIND;
9295 cv_broadcast(&segp->s_cv);
9296 rsmseglock_release(segp);
9297 DBG_PRINTF((category, RSM_ERR,
9298 "%s done: exp_qscd create failed = %d\n",
9299 function, ret));
9300 return;
9303 ret = adapter->rsmpi_ops->rsm_publish(
9304 segp->s_handle.out, segp->s_acl_in, segp->s_acl_len,
9305 segp->s_segid, RSM_RESOURCE_DONTWAIT, NULL);
9307 if (ret != RSM_SUCCESS) {
9308 acl_len = segp->s_acl_len;
9309 acl = segp->s_acl;
9310 rsmpi_acl = segp->s_acl_in;
9311 segp->s_acl_len = 0;
9312 segp->s_acl = NULL;
9313 segp->s_acl_in = NULL;
9314 adapter->rsmpi_ops->rsm_seg_destroy(segp->s_handle.out);
9315 rsmseglock_release(segp);
9317 rsmexport_rm(segp);
9318 rsmacl_free(acl, acl_len);
9319 rsmpiacl_free(rsmpi_acl, acl_len);
9321 rsmseglock_acquire(segp);
9322 segp->s_state = RSM_STATE_BIND;
9323 cv_broadcast(&segp->s_cv);
9324 rsmseglock_release(segp);
9325 DBG_PRINTF((category, RSM_ERR,
9326 "%s done: exp_qscd publish failed = %d\n",
9327 function, ret));
9328 return;
9331 segp->s_state = RSM_STATE_EXPORT;
9332 cv_broadcast(&segp->s_cv);
9333 rsmseglock_release(segp);
9334 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "%s done: exp_qscd\n",
9335 function));
9336 return;
9339 rsmseglock_release(segp);
9341 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "%s done\n", function));
9344 static void
9345 rsm_quiesce_imp_seg(rsmresource_t *resp)
9347 rsmseg_t *segp = (rsmseg_t *)resp;
9348 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL);
9349 DBG_DEFINE_STR(function, "rsm_quiesce_imp_seg");
9351 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
9352 "%s enter: key=%u\n", function, segp->s_key));
9354 rsmseglock_acquire(segp);
9355 segp->s_flags |= RSM_DR_INPROGRESS;
9357 while (segp->s_rdmacnt != 0) {
9358 /* wait for the RDMA to complete */
9359 cv_wait(&segp->s_cv, &segp->s_lock);
9362 rsmseglock_release(segp);
9364 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "%s done\n", function));
9368 static void
9369 rsm_unquiesce_imp_seg(rsmresource_t *resp)
9371 rsmseg_t *segp = (rsmseg_t *)resp;
9372 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL);
9373 DBG_DEFINE_STR(function, "rsm_unquiesce_imp_seg");
9375 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
9376 "%s enter: key=%u\n", function, segp->s_key));
9378 rsmseglock_acquire(segp);
9380 segp->s_flags &= ~RSM_DR_INPROGRESS;
9381 /* wake up any waiting putv/getv ops */
9382 cv_broadcast(&segp->s_cv);
9384 rsmseglock_release(segp);
9386 DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "%s done\n", function));
9391 static void
9392 rsm_process_exp_seg(rsmresource_t *resp, int event)
9394 if (event == RSM_DR_QUIESCE)
9395 rsm_quiesce_exp_seg(resp);
9396 else /* UNQUIESCE */
9397 rsm_unquiesce_exp_seg(resp);
9400 static void
9401 rsm_process_imp_seg(rsmresource_t *resp, int event)
9403 if (event == RSM_DR_QUIESCE)
9404 rsm_quiesce_imp_seg(resp);
9405 else /* UNQUIESCE */
9406 rsm_unquiesce_imp_seg(resp);
9409 static void
9410 rsm_dr_process_local_segments(int event)
9413 int i, j;
9414 rsmresource_blk_t *blk;
9415 rsmresource_t *p;
9416 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL);
9418 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
9419 "rsm_dr_process_local_segments enter\n"));
9421 /* iterate through the resource structure */
9423 rw_enter(&rsm_resource.rsmrc_lock, RW_READER);
9425 for (i = 0; i < rsm_resource.rsmrc_len; i++) {
9426 blk = rsm_resource.rsmrc_root[i];
9427 if (blk != NULL) {
9428 for (j = 0; j < RSMRC_BLKSZ; j++) {
9429 p = blk->rsmrcblk_blks[j];
9430 if ((p != NULL) && (p != RSMRC_RESERVED)) {
9431 /* valid resource */
9432 if (p->rsmrc_type ==
9433 RSM_RESOURCE_EXPORT_SEGMENT)
9434 rsm_process_exp_seg(p, event);
9435 else if (p->rsmrc_type ==
9436 RSM_RESOURCE_IMPORT_SEGMENT)
9437 rsm_process_imp_seg(p, event);
9443 rw_exit(&rsm_resource.rsmrc_lock);
9445 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
9446 "rsm_dr_process_local_segments done\n"));
9449 /* *************** DR callback functions ************ */
9450 static void
9451 rsm_dr_callback_post_add(void *arg, pgcnt_t delta /* ARGSUSED */)
9453 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL);
9454 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
9455 "rsm_dr_callback_post_add is a no-op\n"));
9456 /* Noop */
9459 static int
9460 rsm_dr_callback_pre_del(void *arg, pgcnt_t delta /* ARGSUSED */)
9462 int recheck_state = 0;
9463 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL);
9465 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
9466 "rsm_dr_callback_pre_del enter\n"));
9468 mutex_enter(&rsm_drv_data.drv_lock);
9470 do {
9471 recheck_state = 0;
9472 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
9473 "rsm_dr_callback_pre_del:state=%d\n",
9474 rsm_drv_data.drv_state));
9476 switch (rsm_drv_data.drv_state) {
9477 case RSM_DRV_NEW:
9479 * The state should usually never be RSM_DRV_NEW
9480 * since in this state the callbacks have not yet
9481 * been registered. So, ASSERT.
9483 ASSERT(0);
9484 return (0);
9485 case RSM_DRV_REG_PROCESSING:
9487 * The driver is in the process of registering
9488 * with the DR framework. So, wait till the
9489 * registration process is complete.
9491 recheck_state = 1;
9492 cv_wait(&rsm_drv_data.drv_cv, &rsm_drv_data.drv_lock);
9493 break;
9494 case RSM_DRV_UNREG_PROCESSING:
9496 * If the state is RSM_DRV_UNREG_PROCESSING, the
9497 * module is in the process of detaching and
9498 * unregistering the callbacks from the DR
9499 * framework. So, simply return.
9501 mutex_exit(&rsm_drv_data.drv_lock);
9502 DBG_PRINTF((category, RSM_DEBUG,
9503 "rsm_dr_callback_pre_del:"
9504 "pre-del on NEW/UNREG\n"));
9505 return (0);
9506 case RSM_DRV_OK:
9507 rsm_drv_data.drv_state = RSM_DRV_PREDEL_STARTED;
9508 break;
9509 case RSM_DRV_PREDEL_STARTED:
9510 /* FALLTHRU */
9511 case RSM_DRV_PREDEL_COMPLETED:
9512 /* FALLTHRU */
9513 case RSM_DRV_POSTDEL_IN_PROGRESS:
9514 recheck_state = 1;
9515 cv_wait(&rsm_drv_data.drv_cv, &rsm_drv_data.drv_lock);
9516 break;
9517 case RSM_DRV_DR_IN_PROGRESS:
9518 rsm_drv_data.drv_memdel_cnt++;
9519 mutex_exit(&rsm_drv_data.drv_lock);
9520 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
9521 "rsm_dr_callback_pre_del done\n"));
9522 return (0);
9523 /* break; */
9524 default:
9525 ASSERT(0);
9526 break;
9529 } while (recheck_state);
9531 rsm_drv_data.drv_memdel_cnt++;
9533 mutex_exit(&rsm_drv_data.drv_lock);
9535 /* Do all the quiescing stuff here */
9536 DBG_PRINTF((category, RSM_DEBUG,
9537 "rsm_dr_callback_pre_del: quiesce things now\n"));
9539 rsm_dr_process_local_segments(RSM_DR_QUIESCE);
9542 * now that all local segments have been quiesced lets inform
9543 * the importers
9545 rsm_send_suspend();
9548 * In response to the suspend message the remote node(s) will process
9549 * the segments and send a suspend_complete message. Till all
9550 * the nodes send the suspend_complete message we wait in the
9551 * RSM_DRV_PREDEL_STARTED state. In the exporter_quiesce
9552 * function we transition to the RSM_DRV_PREDEL_COMPLETED state.
9554 mutex_enter(&rsm_drv_data.drv_lock);
9556 while (rsm_drv_data.drv_state == RSM_DRV_PREDEL_STARTED) {
9557 cv_wait(&rsm_drv_data.drv_cv, &rsm_drv_data.drv_lock);
9560 ASSERT(rsm_drv_data.drv_state == RSM_DRV_PREDEL_COMPLETED);
9562 rsm_drv_data.drv_state = RSM_DRV_DR_IN_PROGRESS;
9563 cv_broadcast(&rsm_drv_data.drv_cv);
9565 mutex_exit(&rsm_drv_data.drv_lock);
9567 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
9568 "rsm_dr_callback_pre_del done\n"));
9570 return (0);
9573 static void
9574 rsm_dr_callback_post_del(void *arg, pgcnt_t delta, int cancelled /* ARGSUSED */)
9576 int recheck_state = 0;
9577 DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL);
9579 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
9580 "rsm_dr_callback_post_del enter\n"));
9582 mutex_enter(&rsm_drv_data.drv_lock);
9584 do {
9585 recheck_state = 0;
9586 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
9587 "rsm_dr_callback_post_del:state=%d\n",
9588 rsm_drv_data.drv_state));
9590 switch (rsm_drv_data.drv_state) {
9591 case RSM_DRV_NEW:
9593 * The driver state cannot not be RSM_DRV_NEW
9594 * since in this state the callbacks have not
9595 * yet been registered.
9597 ASSERT(0);
9598 return;
9599 case RSM_DRV_REG_PROCESSING:
9601 * The driver is in the process of registering with
9602 * the DR framework. Wait till the registration is
9603 * complete.
9605 recheck_state = 1;
9606 cv_wait(&rsm_drv_data.drv_cv, &rsm_drv_data.drv_lock);
9607 break;
9608 case RSM_DRV_UNREG_PROCESSING:
9610 * RSM_DRV_UNREG_PROCESSING state means the module
9611 * is detaching and unregistering the callbacks
9612 * from the DR framework. So simply return.
9614 /* FALLTHRU */
9615 case RSM_DRV_OK:
9617 * RSM_DRV_OK means we missed the pre-del
9618 * corresponding to this post-del coz we had not
9619 * registered yet, so simply return.
9621 mutex_exit(&rsm_drv_data.drv_lock);
9622 DBG_PRINTF((category, RSM_DEBUG,
9623 "rsm_dr_callback_post_del:"
9624 "post-del on OK/UNREG\n"));
9625 return;
9626 /* break; */
9627 case RSM_DRV_PREDEL_STARTED:
9628 /* FALLTHRU */
9629 case RSM_DRV_PREDEL_COMPLETED:
9630 /* FALLTHRU */
9631 case RSM_DRV_POSTDEL_IN_PROGRESS:
9632 recheck_state = 1;
9633 cv_wait(&rsm_drv_data.drv_cv, &rsm_drv_data.drv_lock);
9634 break;
9635 case RSM_DRV_DR_IN_PROGRESS:
9636 rsm_drv_data.drv_memdel_cnt--;
9637 if (rsm_drv_data.drv_memdel_cnt > 0) {
9638 mutex_exit(&rsm_drv_data.drv_lock);
9639 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
9640 "rsm_dr_callback_post_del done:\n"));
9641 return;
9643 rsm_drv_data.drv_state = RSM_DRV_POSTDEL_IN_PROGRESS;
9644 break;
9645 default:
9646 ASSERT(0);
9647 return;
9648 /* break; */
9650 } while (recheck_state);
9652 mutex_exit(&rsm_drv_data.drv_lock);
9654 /* Do all the unquiescing stuff here */
9655 DBG_PRINTF((category, RSM_DEBUG,
9656 "rsm_dr_callback_post_del: unquiesce things now\n"));
9658 rsm_dr_process_local_segments(RSM_DR_UNQUIESCE);
9661 * now that all local segments have been unquiesced lets inform
9662 * the importers
9664 rsm_send_resume();
9666 mutex_enter(&rsm_drv_data.drv_lock);
9668 rsm_drv_data.drv_state = RSM_DRV_OK;
9670 cv_broadcast(&rsm_drv_data.drv_cv);
9672 mutex_exit(&rsm_drv_data.drv_lock);
9674 DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
9675 "rsm_dr_callback_post_del done\n"));
9677 return;