8322 nl: misleading-indentation
[unleashed/tickless.git] / usr / src / cmd / rcm_daemon / common / rcm_lock.c
blob5f8c56ce866b3239f046cdc032a0b6a73d341618
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License"). You may not use this file except in compliance
7 * with the License.
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
20 * CDDL HEADER END
22 * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
26 #pragma ident "%Z%%M% %I% %E% SMI"
28 #include "rcm_impl.h"
29 #include "rcm_module.h"
32 * Global locks
34 mutex_t rcm_req_lock; /* protects global dr & info request list */
37 * Daemon state file
39 static int state_fd;
40 #define RCM_STATE_FILE "/var/run/rcm_daemon_state"
41 #define N_REQ_CHUNK 10 /* grow 10 entries at a time */
44 * Daemon timeout value
46 #define RCM_DAEMON_TIMEOUT 300 /* 5 minutes idle time */
/*
 * Struct for a list of outstanding rcm requests
 */
typedef struct {
	int seq_num;			/* sequence number of request */
	int state;			/* current state */
	pid_t pid;			/* pid of initiator */
	uint_t flag;			/* request flags */
	int type;			/* resource(device) type */
	timespec_t interval;		/* suspend interval */
	char device[MAXPATHLEN];	/* name of device or resource */
} req_t;

/*
 * Growable request list.  For dr requests this structure is mmap'ed
 * from the daemon state file, so it must stay position-independent
 * (no pointers).  A slot is "free" when its state is RCM_STATE_REMOVE
 * or its device name is empty; removed slots leave holes, n_req only
 * counts slots in use.
 */
typedef struct {
	int n_req;		/* number of slots currently in use */
	int n_req_max;		/* number of req_t's to follow */
	int n_seq_max;		/* last sequence number */
	int idle_timeout;	/* persist idle timeout value */
	req_t req[1];		/* first slot of the trailing array */
	/* more req_t follows */
} req_list_t;
70 static req_list_t *dr_req_list;
71 static req_list_t *info_req_list;
73 static const char *locked_info = "DR operation in progress";
74 static const char *locked_err = "Resource is busy";
76 static int rcmd_get_state();
77 static void add_to_polling_list(pid_t);
78 static void remove_from_polling_list(pid_t);
80 void start_polling_thread();
81 static void stop_polling_thread();
/*
 * Initialize request lists required for locking.
 *
 * The info request list lives on the heap; the dr request list is
 * backed by RCM_STATE_FILE and mmap'ed MAP_SHARED so that daemon
 * state survives a daemon restart.  Any fatal setup error exits the
 * daemon via rcmd_exit().
 */
void
rcmd_lock_init(void)
{
	int size;
	struct stat fbuf;

	/*
	 * Start info list with one slot, then grow on demand.
	 */
	info_req_list = s_calloc(1, sizeof (req_list_t));
	info_req_list->n_req_max = 1;

	/*
	 * Open daemon state file and map in contents
	 */
	state_fd = open(RCM_STATE_FILE, O_CREAT|O_RDWR, 0600);
	if (state_fd == -1) {
		rcm_log_message(RCM_ERROR, gettext("cannot open %s: %s\n"),
		    RCM_STATE_FILE, strerror(errno));
		rcmd_exit(errno);
	}

	if (fstat(state_fd, &fbuf) != 0) {
		rcm_log_message(RCM_ERROR, gettext("cannot stat %s: %s\n"),
		    RCM_STATE_FILE, strerror(errno));
		rcmd_exit(errno);
	}

	/*
	 * A zero-length (freshly created) state file is extended to
	 * hold one empty req_list_t before it can be mapped.
	 */
	size = fbuf.st_size;
	if (size == 0) {
		size = sizeof (req_list_t);
		if (ftruncate(state_fd, size) != 0) {
			rcm_log_message(RCM_ERROR,
			    gettext("cannot truncate %s: %s\n"),
			    RCM_STATE_FILE, strerror(errno));
			rcmd_exit(errno);
		}
	}

	/*LINTED*/
	dr_req_list = (req_list_t *)mmap(NULL, size, PROT_READ|PROT_WRITE,
	    MAP_SHARED, state_fd, 0);
	if (dr_req_list == MAP_FAILED) {
		rcm_log_message(RCM_ERROR, gettext("cannot mmap %s: %s\n"),
		    RCM_STATE_FILE, strerror(errno));
		rcmd_exit(errno);
	}

	/*
	 * Initial size is one entry; n_req_max == 0 means the file was
	 * just created, so there is no previous state to recover.
	 */
	if (dr_req_list->n_req_max == 0) {
		dr_req_list->n_req_max = 1;
		(void) fsync(state_fd);
		return;
	}

	rcm_log_message(RCM_DEBUG, "n_req = %d, n_req_max = %d\n",
	    dr_req_list->n_req, dr_req_list->n_req_max);

	/*
	 * Recover the daemon state left behind by a previous incarnation.
	 */
	clean_dr_list();
}
153 * Get a unique sequence number--to be called with rcm_req_lock held.
155 static int
156 get_seq_number()
158 int number;
160 if (dr_req_list == NULL)
161 return (0);
163 dr_req_list->n_seq_max++;
164 number = (dr_req_list->n_seq_max << SEQ_NUM_SHIFT);
165 (void) fsync(state_fd);
167 return (number);
/*
 * Find entry in list with the same resource name and sequence number.
 * If seq_num == -1, no seq_num matching is required.
 *
 * Returns a pointer into the list's own storage, or NULL when no
 * live entry matches.  Caller must hold the lock protecting 'list'.
 */
static req_t *
find_req_entry(char *device, uint_t flag, int seq_num, req_list_t *list)
{
	int i;

	/*
	 * Look for entry with the same resource and seq_num.
	 * Also match RCM_FILESYS field in flag.
	 */
	for (i = 0; i < list->n_req_max; i++) {
		if (list->req[i].state == RCM_STATE_REMOVE)
			/* stale entry */
			continue;

		/*
		 * We need to distiguish a file system root from the directory
		 * it is mounted on.
		 *
		 * Applications are not aware of any difference between the
		 * two, but the system keeps track of it internally by
		 * checking for mount points while traversing file path.
		 * In a similar spirit, RCM is keeping this difference as
		 * an implementation detail.
		 */
		if ((strcmp(device, list->req[i].device) != 0) ||
		    (list->req[i].flag & RCM_FILESYS) != (flag & RCM_FILESYS))
			/* different resource */
			continue;

		/* compare base sequence numbers (cascade bits stripped) */
		if ((seq_num != -1) && ((seq_num >> SEQ_NUM_SHIFT) !=
		    (list->req[i].seq_num >> SEQ_NUM_SHIFT)))
			/* different base seqnum */
			continue;

		return (&list->req[i]);
	}

	return (NULL);
}
/*
 * Get the next empty req_t entry. If no entry exists, grow the list.
 *
 * The info list grows via realloc; the dr list grows by extending the
 * state file with ftruncate and re-mmap'ing it (the old mapping is not
 * unmapped here).  Caller must hold rcm_req_lock.  Never returns NULL:
 * allocation failures exit the daemon.
 */
static req_t *
get_req_entry(req_list_t **listp)
{
	int i;
	int n_req = (*listp)->n_req;
	int n_req_max = (*listp)->n_req_max;

	/*
	 * If the list is full, grow the list and return the first
	 * entry in the new portion.
	 */
	if (n_req == n_req_max) {
		int newsize;

		n_req_max += N_REQ_CHUNK;
		newsize = sizeof (req_list_t) + (n_req_max - 1) *
		    sizeof (req_t);

		if (listp == &info_req_list) {
			/* heap-backed list: simple realloc */
			*listp = s_realloc(*listp, newsize);
		} else if (ftruncate(state_fd, newsize) != 0) {
			rcm_log_message(RCM_ERROR,
			    gettext("cannot truncate %s: %s\n"),
			    RCM_STATE_FILE, strerror(errno));
			rcmd_exit(errno);
		/*LINTED*/
		} else if ((*listp = (req_list_t *)mmap(NULL, newsize,
		    PROT_READ|PROT_WRITE, MAP_SHARED, state_fd, 0)) ==
		    MAP_FAILED) {
			rcm_log_message(RCM_ERROR,
			    gettext("cannot mmap %s: %s\n"),
			    RCM_STATE_FILE, strerror(errno));
			rcmd_exit(errno);
		}

		/* Initialize the new entries */
		for (i = (*listp)->n_req_max; i < n_req_max; i++) {
			(*listp)->req[i].state = RCM_STATE_REMOVE;
			(void) strcpy((*listp)->req[i].device, "");
		}

		(*listp)->n_req_max = n_req_max;
		(*listp)->n_req++;
		return (&(*listp)->req[n_req]);
	}

	/*
	 * List contains empty slots, find it.
	 */
	for (i = 0; i < n_req_max; i++) {
		if (((*listp)->req[i].device[0] == '\0') ||
		    ((*listp)->req[i].state == RCM_STATE_REMOVE)) {
			break;
		}
	}

	assert(i < n_req_max);	/* empty slot must exist */

	(*listp)->n_req++;
	return (&(*listp)->req[i]);
}
279 * When one resource depends on multiple resources, it's possible that
280 * rcm_get_info can be called multiple times on the resource, resulting
281 * in duplicate information. By assigning a unique sequence number to
282 * each rcm_get_info operation, this duplication can be eliminated.
284 * Insert a dr entry in info_req_list
287 info_req_add(char *rsrcname, uint_t flag, int seq_num)
289 int error = 0;
290 char *device;
291 req_t *req;
293 rcm_log_message(RCM_TRACE2, "info_req_add(%s, %d)\n",
294 rsrcname, seq_num);
296 device = resolve_name(rsrcname);
297 (void) mutex_lock(&rcm_req_lock);
300 * Look for entry with the same resource and seq_num.
301 * If it exists, we return an error so that such
302 * information is not gathered more than once.
304 if (find_req_entry(device, flag, seq_num, info_req_list) != NULL) {
305 rcm_log_message(RCM_DEBUG, "getinfo cycle: %s %d \n",
306 device, seq_num);
307 error = -1;
308 goto out;
312 * Get empty entry and fill in seq_num and device.
314 req = get_req_entry(&info_req_list);
315 req->seq_num = seq_num;
316 req->state = RCM_STATE_ONLINE; /* mark that the entry is in use */
317 req->flag = flag;
318 (void) strcpy(req->device, device);
320 out:
321 (void) mutex_unlock(&rcm_req_lock);
322 free(device);
324 return (error);
328 * Remove all entries associated with seq_num from info_req_list
330 void
331 info_req_remove(int seq_num)
333 int i;
335 rcm_log_message(RCM_TRACE3, "info_req_remove(%d)\n", seq_num);
337 seq_num >>= SEQ_NUM_SHIFT;
338 (void) mutex_lock(&rcm_req_lock);
340 /* remove all entries with seq_num */
341 for (i = 0; i < info_req_list->n_req_max; i++) {
342 if (info_req_list->req[i].state == RCM_STATE_REMOVE)
343 continue;
345 if ((info_req_list->req[i].seq_num >> SEQ_NUM_SHIFT) != seq_num)
346 continue;
348 info_req_list->req[i].state = RCM_STATE_REMOVE;
349 info_req_list->n_req--;
353 * We don't shrink the info_req_list size for now.
355 (void) mutex_unlock(&rcm_req_lock);
359 * Checking lock conflicts. There is a conflict if:
360 * - attempt to DR a node when either its ancester or descendent
361 * is in the process of DR
362 * - attempt to register for a node when its ancester is locked for DR
364 static int
365 check_lock(char *device, uint_t flag, int cflag, rcm_info_t **info)
367 int i, ret = RCM_SUCCESS;
369 if (info)
370 *info = NULL;
373 * During daemon initialization, don't check locks
375 if (dr_req_list == NULL)
376 return (ret);
378 for (i = 0; i < dr_req_list->n_req; i++) {
379 req_t *req = &dr_req_list->req[i];
380 char *dr_dev = req->device;
383 * Skip empty entries
385 if ((req->state == RCM_STATE_REMOVE) || (dr_dev[0] == '\0'))
386 continue;
389 * Make sure that none of the ancestors of dr_dev is
390 * being operated upon.
392 if (EQUAL(device, dr_dev) || DESCENDENT(device, dr_dev)) {
394 * An exception to this is the filesystem.
395 * We should allowed a filesystem rooted at a
396 * child directory to be unmounted.
398 if ((flag & RCM_FILESYS) && (!EQUAL(device, dr_dev) ||
399 ((dr_req_list->req[i].flag & RCM_FILESYS) == 0)))
400 continue;
402 assert(info != 0);
404 add_busy_rsrc_to_list(dr_dev, dr_req_list->req[i].pid,
405 dr_req_list->req[i].state,
406 dr_req_list->req[i].seq_num, NULL, locked_info,
407 locked_err, NULL, info);
408 ret = RCM_CONFLICT;
409 break;
412 if ((cflag == LOCK_FOR_DR) && DESCENDENT(dr_dev, device)) {
414 * Check descendents only for DR request.
416 * Could have multiple descendents doing DR,
417 * we want to find them all.
419 assert(info != 0);
421 add_busy_rsrc_to_list(dr_dev, dr_req_list->req[i].pid,
422 dr_req_list->req[i].state,
423 dr_req_list->req[i].seq_num, NULL, locked_info,
424 locked_err, NULL, info);
425 ret = RCM_CONFLICT;
426 /* don't break here, need to find all conflicts */
430 return (ret);
434 * Check for lock conflicts for DR operation or client registration
437 rsrc_check_lock_conflicts(char *rsrcname, uint_t flag, int cflag,
438 rcm_info_t **info)
440 int result;
441 char *device;
443 device = resolve_name(rsrcname);
444 result = check_lock(device, flag, cflag, info);
445 free(device);
447 return (result);
450 static int
451 transition_state(int state)
454 * If the resource state is in transition, ask caller to
455 * try again.
457 switch (state) {
458 case RCM_STATE_OFFLINING:
459 case RCM_STATE_SUSPENDING:
460 case RCM_STATE_RESUMING:
461 case RCM_STATE_ONLINING:
462 case RCM_STATE_REMOVING:
464 return (1);
466 default:
467 /*FALLTHROUGH*/
468 break;
470 return (0);
/*
 * Update a dr entry in dr_req_list.
 *
 * Validates that the requested state change is a legal transition from
 * the entry's current state, then commits it and syncs the state file.
 *
 * Returns:
 *	RCM_SUCCESS  - entry updated
 *	RCM_FAILURE  - no matching entry exists (caller may add one)
 *	RCM_CONFLICT - invalid transition or initiator pid mismatch
 *	EAGAIN       - entry is in a transition state; caller should retry
 * On conflict the busy resource is appended to *infop (if non-NULL).
 * Caller must hold rcm_req_lock.
 */
/*ARGSUSED*/
static int
dr_req_update_entry(char *device, pid_t pid, uint_t flag, int state,
    int seq_num, timespec_t *interval, rcm_info_t **infop)
{
	req_t *req;

	/*
	 * Find request entry. If not found, return RCM_FAILURE
	 */
	req = find_req_entry(device, flag, -1, dr_req_list);

	if (req == NULL) {
		switch (state) {
		case RCM_STATE_OFFLINE_QUERYING:
		case RCM_STATE_SUSPEND_QUERYING:
		case RCM_STATE_OFFLINING:
		case RCM_STATE_SUSPENDING:
			/* could be re-do operation, no error message */
			break;

		default:
			rcm_log_message(RCM_DEBUG,
			    "update non-existing resource %s\n", device);
		}
		return (RCM_FAILURE);
	}

	/*
	 * During initialization, update is unconditional (forced)
	 * in order to bring the daemon up in a sane state.
	 */
	if (rcmd_get_state() == RCMD_INIT)
		goto update;

	/*
	 * Don't allow update with mismatched initiator pid. This could happen
	 * as part of normal operation.
	 */
	if (pid != req->pid) {
		rcm_log_message(RCM_INFO,
		    gettext("mismatched dr initiator pid: %ld %ld\n"),
		    req->pid, pid);
		goto failure;
	}

	rcm_log_message(RCM_TRACE4,
	    "dr_req_update_entry: state=%d, device=%s\n",
	    req->state, req->device);

	/*
	 * Check that the state transition is valid
	 */
	switch (state) {
	case RCM_STATE_OFFLINE_QUERYING:
	case RCM_STATE_OFFLINING:
		/*
		 * This is the case of re-offlining, which applies only
		 * if a previous attempt failed.
		 */
		if ((req->state != RCM_STATE_OFFLINE_FAIL) &&
		    (req->state != RCM_STATE_OFFLINE_QUERYING) &&
		    (req->state != RCM_STATE_OFFLINE_QUERY) &&
		    (req->state != RCM_STATE_OFFLINE_QUERY_FAIL) &&
		    (req->state != RCM_STATE_OFFLINE)) {
			rcm_log_message(RCM_WARNING,
			    gettext("%s: invalid offlining from state %d\n"),
			    device, req->state);
			goto failure;
		}
		break;

	case RCM_STATE_SUSPEND_QUERYING:
	case RCM_STATE_SUSPENDING:
		/*
		 * This is the case of re-suspending, which applies only
		 * if a previous attempt failed.
		 */
		if ((req->state != RCM_STATE_SUSPEND_FAIL) &&
		    (req->state != RCM_STATE_SUSPEND_QUERYING) &&
		    (req->state != RCM_STATE_SUSPEND_QUERY) &&
		    (req->state != RCM_STATE_SUSPEND_QUERY_FAIL) &&
		    (req->state != RCM_STATE_SUSPEND)) {
			rcm_log_message(RCM_WARNING,
			    gettext("%s: invalid suspending from state %d\n"),
			    device, req->state);
			goto failure;
		}
		break;

	case RCM_STATE_RESUMING:
		/* resuming is only valid from a suspended/failed state */
		if ((req->state != RCM_STATE_SUSPEND) &&
		    (req->state != RCM_STATE_SUSPEND_QUERYING) &&
		    (req->state != RCM_STATE_SUSPEND_QUERY) &&
		    (req->state != RCM_STATE_SUSPEND_QUERY_FAIL) &&
		    (req->state != RCM_STATE_SUSPEND_FAIL)) {
			rcm_log_message(RCM_DEBUG,
			    "%s: invalid resuming from state %d\n",
			    device, req->state);
			goto failure;
		}
		break;

	case RCM_STATE_ONLINING:
		/* onlining is only valid from an offlined/failed state */
		if ((req->state != RCM_STATE_OFFLINE) &&
		    (req->state != RCM_STATE_OFFLINE_QUERYING) &&
		    (req->state != RCM_STATE_OFFLINE_QUERY) &&
		    (req->state != RCM_STATE_OFFLINE_QUERY_FAIL) &&
		    (req->state != RCM_STATE_OFFLINE_FAIL)) {
			rcm_log_message(RCM_INFO,
			    gettext("%s: invalid onlining from state %d\n"),
			    device, req->state);
			goto failure;
		}
		break;

	case RCM_STATE_REMOVING:
		/* removal requires a successful or failed offline first */
		if ((req->state != RCM_STATE_OFFLINE) &&
		    (req->state != RCM_STATE_OFFLINE_FAIL)) {
			rcm_log_message(RCM_INFO,
			    gettext("%s: invalid removing from state %d\n"),
			    device, req->state);
			goto failure;
		}
		break;

	case RCM_STATE_SUSPEND_FAIL:
		assert(req->state == RCM_STATE_SUSPENDING);
		break;

	case RCM_STATE_OFFLINE_FAIL:
		assert(req->state == RCM_STATE_OFFLINING);
		break;

	case RCM_STATE_SUSPEND:
		assert(req->state == RCM_STATE_SUSPENDING);
		break;

	case RCM_STATE_OFFLINE:
		assert(req->state == RCM_STATE_OFFLINING);
		break;

	case RCM_STATE_ONLINE:
		assert((req->state == RCM_STATE_RESUMING) ||
		    (req->state == RCM_STATE_ONLINING));
		break;

	default:	/* shouldn't be here */
		rcm_log_message(RCM_ERROR,
		    gettext("invalid update to dr state: %d\n"), state);
		return (RCM_FAILURE);
	}

update:
	/*
	 * update the state, interval, and sequence number; sync state file
	 */
	req->state = state;
	req->seq_num = seq_num;

	if (interval)
		req->interval = *interval;
	else
		bzero(&req->interval, sizeof (timespec_t));

	(void) fsync(state_fd);
	return (RCM_SUCCESS);

failure:
	if (infop != NULL) {
		add_busy_rsrc_to_list(req->device, req->pid, req->state,
		    req->seq_num, NULL, locked_info, locked_err, NULL, infop);
	}

	/*
	 * A request may be left in a transition state because the operator
	 * typed ctrl-C. In this case, the daemon thread continues to run
	 * and will eventually put the state in a non-transitional state.
	 *
	 * To be safe, we return EAGAIN to allow librcm to loop and retry.
	 * If we are called from a module, loop & retry could result in a
	 * deadlock. The called will check for this case and turn EAGAIN
	 * into RCM_CONFLICT.
	 */
	if (transition_state(req->state)) {
		return (EAGAIN);
	}

	return (RCM_CONFLICT);
}
/*
 * Insert a dr entry in dr_req_list.
 *
 * First attempts to update an existing entry (the re-offline/re-suspend
 * case); only when no entry exists is a new one added, after checking
 * for lock conflicts.  The initiator pid is added to the polling list
 * so its death can be detected.
 *
 * Returns RCM_SUCCESS, RCM_CONFLICT, EAGAIN (retry), or EINVAL when
 * rsrcname cannot be resolved.
 */
int
dr_req_add(char *rsrcname, pid_t pid, uint_t flag, int state, int seq_num,
    timespec_t *interval, rcm_info_t **info)
{
	int error;
	char *device;
	req_t *req;

	rcm_log_message(RCM_TRACE3, "dr_req_add(%s, %ld, 0x%x, %d, %d, %p)\n",
	    rsrcname, pid, flag, state, seq_num, (void *)info);

	device = resolve_name(rsrcname);
	if (device == NULL)
		return (EINVAL);

	(void) mutex_lock(&rcm_req_lock);

	/*
	 * In the re-offline/suspend case, attempt to update dr request.
	 *
	 * If this succeeds, return success;
	 * If this fails because of a conflict, return error;
	 * If this this fails because no entry exists, add a new entry.
	 */
	error = dr_req_update_entry(device, pid, flag, state, seq_num, interval,
	    info);

	switch (error) {
	case RCM_FAILURE:
		/* proceed to add a new entry */
		break;

	case RCM_CONFLICT:
	case RCM_SUCCESS:
	case EAGAIN:
	default:
		goto out;
	}

	/*
	 * Check for lock conflicts
	 */
	error = check_lock(device, flag, LOCK_FOR_DR, info);
	if (error != RCM_SUCCESS) {
		error = RCM_CONFLICT;
		goto out;
	}

	/*
	 * Get empty request entry, fill in values and sync state file
	 */
	req = get_req_entry(&dr_req_list);

	req->seq_num = seq_num;
	req->pid = pid;
	req->flag = flag;
	req->state = state;
	req->type = rsrc_get_type(device);
	(void) strcpy(req->device, device);

	/* cache interval for failure recovery */
	if (interval)
		req->interval = *interval;
	else
		bzero(&req->interval, sizeof (timespec_t));

	(void) fsync(state_fd);

	/*
	 * Add initiator pid to polling list
	 */
	add_to_polling_list(req->pid);

out:
	(void) mutex_unlock(&rcm_req_lock);
	free(device);

	return (error);
}
751 * Update a dr entry in dr_req_list
753 /*ARGSUSED*/
755 dr_req_update(char *rsrcname, pid_t pid, uint_t flag, int state, int seq_num,
756 rcm_info_t **info)
758 int error;
759 char *device = resolve_name(rsrcname);
761 rcm_log_message(RCM_TRACE3, "dr_req_update(%s, %ld, 0x%x, %d, %d)\n",
762 rsrcname, pid, flag, state, seq_num);
764 (void) mutex_lock(&rcm_req_lock);
765 error = dr_req_update_entry(device, pid, flag, state, seq_num, NULL,
766 info);
767 (void) mutex_unlock(&rcm_req_lock);
768 free(device);
770 return (error);
774 * This function scans the DR request list for the next, non-removed
775 * entry that is part of the specified sequence. The 'device' name
776 * of the entry is copied into the provided 'rsrc' buffer.
778 * The 'rsrc' buffer is required because the DR request list is only
779 * locked during the duration of this lookup. Giving a direct pointer
780 * to something in the list would be unsafe.
783 dr_req_lookup(int seq_num, char *rsrc)
785 int i;
786 int len;
787 int base = (seq_num >> SEQ_NUM_SHIFT);
788 int retval = RCM_FAILURE;
790 if (rsrc == NULL) {
791 return (RCM_FAILURE);
794 (void) mutex_lock(&rcm_req_lock);
796 for (i = 0; i < dr_req_list->n_req_max; i++) {
798 /* Skip removed or non-matching entries */
799 if ((dr_req_list->req[i].state == RCM_STATE_REMOVE) ||
800 ((dr_req_list->req[i].seq_num >> SEQ_NUM_SHIFT) != base)) {
801 continue;
804 /* Copy the next-matching 'device' name into 'rsrc' */
805 len = strlcpy(rsrc, dr_req_list->req[i].device, MAXPATHLEN);
806 if (len < MAXPATHLEN) {
807 retval = RCM_SUCCESS;
809 break;
812 (void) mutex_unlock(&rcm_req_lock);
814 return (retval);
818 * Remove a dr entry in dr_req_list
820 void
821 dr_req_remove(char *rsrcname, uint_t flag)
823 req_t *req;
824 char *device = resolve_name(rsrcname);
826 rcm_log_message(RCM_TRACE3, "dr_req_remove(%s)\n", rsrcname);
828 (void) mutex_lock(&rcm_req_lock);
830 /* find entry */
831 req = find_req_entry(device, flag, -1, dr_req_list);
832 free(device);
834 if (req == NULL) {
835 (void) mutex_unlock(&rcm_req_lock);
836 rcm_log_message(RCM_WARNING,
837 gettext("dr_req entry %s not found\n"), rsrcname);
838 return;
841 req->state = RCM_STATE_REMOVE;
842 dr_req_list->n_req--;
843 (void) fsync(state_fd);
846 * remove pid from polling list
848 remove_from_polling_list(req->pid);
851 * We don't shrink the dr_req_list size for now.
852 * Shouldn't cause big memory leaks.
854 (void) mutex_unlock(&rcm_req_lock);
/*
 * Return the list of ongoing dr operation requests.
 *
 * Builds a linked list of rcm_info_t nodes, one per live dr request;
 * filesystem resources get a "(fs)" suffix on the name.  The caller
 * owns the returned list.  Any nvlist failure exits the daemon.
 */
rcm_info_t *
rsrc_dr_info()
{
	int i;
	rcm_info_t *info;
	rcm_info_t *result = NULL;
	char *rsrc;
	int len;

	rcm_log_message(RCM_TRACE2, "rsrc_dr_info()\n");

	(void) mutex_lock(&rcm_req_lock);
	for (i = 0; i < dr_req_list->n_req_max; i++) {
		if (dr_req_list->req[i].state == RCM_STATE_REMOVE)
			continue;

		if (dr_req_list->req[i].device[0] == '\0')
			continue;

		if (dr_req_list->req[i].flag & RCM_FILESYS) {
			/* +5 covers "(fs)" plus the terminating NUL */
			len = strlen(dr_req_list->req[i].device) + 5;
			rsrc = s_malloc(len);
			(void) snprintf(rsrc, len, "%s(fs)",
			    dr_req_list->req[i].device);
		} else {
			rsrc = s_strdup(dr_req_list->req[i].device);
		}

		/*
		 * The "errno = nvlist_*()" assignments below are
		 * intentional: the nvlist return code is stashed in
		 * errno so it can be logged and passed to rcmd_exit().
		 */
		info = s_calloc(1, sizeof (*info));
		if (errno = nvlist_alloc(&(info->info), NV_UNIQUE_NAME, 0)) {
			rcm_log_message(RCM_ERROR,
			    gettext("failed (nvlist_alloc=%s).\n"),
			    strerror(errno));
			rcmd_exit(errno);
		}

		if (errno = nvlist_add_string(info->info, RCM_RSRCNAME, rsrc)) {
			rcm_log_message(RCM_ERROR,
			    gettext("failed (nvlist_add=%s).\n"),
			    strerror(errno));
			rcmd_exit(errno);
		}
		(void) free(rsrc);

		if (errno = nvlist_add_int64(info->info, RCM_CLIENT_ID,
		    dr_req_list->req[i].pid)) {
			rcm_log_message(RCM_ERROR,
			    gettext("failed (nvlist_add=%s).\n"),
			    strerror(errno));
			rcmd_exit(errno);
		}

		if (errno = nvlist_add_int32(info->info, RCM_SEQ_NUM,
		    dr_req_list->req[i].seq_num)) {
			rcm_log_message(RCM_ERROR,
			    gettext("failed (nvlist_add=%s).\n"),
			    strerror(errno));
			rcmd_exit(errno);
		}

		if (errno = nvlist_add_int32(info->info, RCM_RSRCSTATE,
		    dr_req_list->req[i].state)) {
			rcm_log_message(RCM_ERROR,
			    gettext("failed (nvlist_add=%s).\n"),
			    strerror(errno));
			rcmd_exit(errno);
		}

		if (errno = nvlist_add_string(info->info, RCM_CLIENT_INFO,
		    (char *)locked_info)) {
			rcm_log_message(RCM_ERROR,
			    gettext("failed (nvlist_add=%s).\n"),
			    strerror(errno));
			rcmd_exit(errno);
		}

		/* prepend to the result list */
		info->next = result;
		result = info;
	}
	(void) mutex_unlock(&rcm_req_lock);

	return (result);
}
/*
 * Eliminate entries whose dr initiator is no longer running
 * and recover daemon state during daemon restart.
 *
 * This routine is called from either during daemon initialization
 * after all modules have registered resources or from the cleanup
 * thread. In either case, it is the only thread running in the
 * daemon.
 *
 * Recovery policy per entry: if the initiator process is still alive,
 * redo (re-drive) the interrupted operation; if it is dead, undo it
 * (bring the resource back online/resumed).
 */
void
clean_dr_list()
{
	int i;
	/* private snapshot of one stale request, decoupled from the list */
	struct clean_list {
		struct clean_list *next;
		char *rsrcname;
		pid_t pid;
		int seq_num;
		int state;
		timespec_t interval;
	} *tmp, *list = NULL;
	char *rsrcnames[2];

	rcm_log_message(RCM_TRACE3,
	    "clean_dr_list(): look for stale dr initiators\n");

	rsrcnames[1] = NULL;

	/*
	 * Make a list of entries to recover. This is necessary because
	 * the recovery operation will modify dr_req_list.
	 */
	(void) mutex_lock(&rcm_req_lock);
	for (i = 0; i < dr_req_list->n_req_max; i++) {
		/* skip empty entries */
		if (dr_req_list->req[i].state == RCM_STATE_REMOVE)
			continue;

		if (dr_req_list->req[i].device[0] == '\0')
			continue;

		/* skip cascade operations */
		if (dr_req_list->req[i].seq_num & SEQ_NUM_MASK)
			continue;

		/*
		 * In the cleanup case, ignore entries with initiators alive
		 */
		if ((rcmd_get_state() == RCMD_CLEANUP) &&
		    proc_exist(dr_req_list->req[i].pid))
			continue;

		rcm_log_message(RCM_TRACE1,
		    "found stale entry: %s\n", dr_req_list->req[i].device);

		tmp = s_malloc(sizeof (*tmp));
		tmp->rsrcname = s_strdup(dr_req_list->req[i].device);
		tmp->state = dr_req_list->req[i].state;
		tmp->pid = dr_req_list->req[i].pid;
		tmp->seq_num = dr_req_list->req[i].seq_num;
		tmp->interval = dr_req_list->req[i].interval;
		tmp->next = list;
		list = tmp;
	}
	(void) mutex_unlock(&rcm_req_lock);

	if (list == NULL)
		return;

	/*
	 * If everything worked normally, we shouldn't be here.
	 * Since we are here, something went wrong, so say something.
	 */
	if (rcmd_get_state() == RCMD_INIT) {
		rcm_log_message(RCM_NOTICE, gettext("rcm_daemon died "
		    "unexpectedly, recovering previous daemon state\n"));
	} else {
		rcm_log_message(RCM_INFO, gettext("one or more dr initiator "
		    "died, attempting automatic recovery\n"));
	}

	while (list) {
		tmp = list;
		list = tmp->next;

		switch (tmp->state) {
		case RCM_STATE_OFFLINE_QUERY:
		case RCM_STATE_OFFLINE_QUERY_FAIL:
			rsrcnames[0] = tmp->rsrcname;
			if (proc_exist(tmp->pid)) {
				/* redo */
				(void) process_resource_offline(rsrcnames,
				    tmp->pid, RCM_QUERY, tmp->seq_num, NULL);
			} else {
				/* undo */
				(void) notify_resource_online(rsrcnames,
				    tmp->pid, 0, tmp->seq_num, NULL);
			}
			break;

		case RCM_STATE_OFFLINE:
		case RCM_STATE_OFFLINE_FAIL:
			rsrcnames[0] = tmp->rsrcname;
			if (proc_exist(tmp->pid)) {
				/* redo */
				(void) process_resource_offline(rsrcnames,
				    tmp->pid, 0, tmp->seq_num, NULL);
			} else {
				/* undo */
				(void) notify_resource_online(rsrcnames,
				    tmp->pid, 0, tmp->seq_num, NULL);
			}
			break;

		case RCM_STATE_SUSPEND_QUERY:
		case RCM_STATE_SUSPEND_QUERY_FAIL:
			rsrcnames[0] = tmp->rsrcname;
			if (proc_exist(tmp->pid)) {
				/* redo */
				(void) process_resource_suspend(rsrcnames,
				    tmp->pid, RCM_QUERY, tmp->seq_num,
				    &tmp->interval, NULL);
			} else {
				/* undo */
				(void) notify_resource_resume(rsrcnames,
				    tmp->pid, 0, tmp->seq_num, NULL);
			}
			break;

		case RCM_STATE_SUSPEND:
		case RCM_STATE_SUSPEND_FAIL:
			rsrcnames[0] = tmp->rsrcname;
			if (proc_exist(tmp->pid)) {
				/* redo */
				(void) process_resource_suspend(rsrcnames,
				    tmp->pid, 0, tmp->seq_num, &tmp->interval,
				    NULL);
			} else {
				/* undo */
				(void) notify_resource_resume(rsrcnames,
				    tmp->pid, 0, tmp->seq_num, NULL);
			}
			break;

		case RCM_STATE_OFFLINING:
		case RCM_STATE_ONLINING:
			/* interrupted transition: force back online */
			rsrcnames[0] = tmp->rsrcname;
			(void) notify_resource_online(rsrcnames, tmp->pid, 0,
			    tmp->seq_num, NULL);
			break;

		case RCM_STATE_SUSPENDING:
		case RCM_STATE_RESUMING:
			/* interrupted transition: force resume */
			rsrcnames[0] = tmp->rsrcname;
			(void) notify_resource_resume(rsrcnames, tmp->pid, 0,
			    tmp->seq_num, NULL);
			break;

		case RCM_STATE_REMOVING:
			rsrcnames[0] = tmp->rsrcname;
			(void) notify_resource_remove(rsrcnames, tmp->pid, 0,
			    tmp->seq_num, NULL);
			break;

		default:
			rcm_log_message(RCM_WARNING,
			    gettext("%s in unknown state %d\n"),
			    tmp->rsrcname, tmp->state);
			break;
		}
		free(tmp->rsrcname);
		free(tmp);
	}
}
/*
 * Selected thread blocking based on event type
 */
barrier_t barrier;

/*
 * Change barrier state:
 *	RCMD_INIT - daemon is intializing, only register allowed
 *	RCMD_NORMAL - normal daemon processing
 *	RCMD_CLEANUP - cleanup thread is waiting or running
 */
int
rcmd_get_state()
{
	/* NOTE(review): read without barrier.lock — presumably tolerated
	 * as a racy snapshot; confirm against callers. */
	return (barrier.state);
}
/*
 * Move the daemon barrier to 'state'.
 *
 * Entering RCMD_CLEANUP blocks until all worker threads have exited
 * and then parks thr_count at -1 so no new workers start; leaving
 * cleanup restores thr_count and wakes any waiters.
 */
void
rcmd_set_state(int state)
{
	/*
	 * The state transition is as follows:
	 *	INIT --> NORMAL <---> CLEANUP
	 * The implementation favors the cleanup thread
	 */

	(void) mutex_lock(&barrier.lock);
	barrier.state = state;

	switch (state) {
	case RCMD_CLEANUP:
		/*
		 * Wait for existing threads to exit
		 */
		barrier.wanted++;
		while (barrier.thr_count != 0)
			(void) cond_wait(&barrier.cv, &barrier.lock);
		barrier.wanted--;
		/* -1 flags "cleanup owns the daemon" to rcmd_thr_incr */
		barrier.thr_count = -1;
		break;

	case RCMD_INIT:
	case RCMD_NORMAL:
	default:
		if (barrier.thr_count == -1)
			barrier.thr_count = 0;
		if (barrier.wanted)
			(void) cond_broadcast(&barrier.cv);
		break;
	}

	(void) mutex_unlock(&barrier.lock);
}
/*
 * Increment daemon thread count.
 *
 * Blocks until the barrier is in RCMD_NORMAL, then registers the
 * calling thread as an active worker.  Returns the sequence number
 * assigned to the operation, or -1 for commands that don't need one.
 */
int
rcmd_thr_incr(int cmd)
{
	int seq_num;

	(void) mutex_lock(&barrier.lock);
	/*
	 * Set wanted flag
	 */
	barrier.wanted++;

	/*
	 * Wait till it is safe for daemon to perform the operation
	 *
	 * NOTE: if a module registers by passing a request to the
	 *	client proccess, we may need to allow register
	 *	to come through during daemon initialization.
	 */
	while (barrier.state != RCMD_NORMAL)
		(void) cond_wait(&barrier.cv, &barrier.lock);

	if ((cmd == CMD_EVENT) ||
	    (cmd == CMD_REGISTER) ||
	    (cmd == CMD_UNREGISTER)) {
		/*
		 * Event passthru and register ops don't need sequence number
		 */
		seq_num = -1;
	} else {
		/*
		 * Non register operation gets a sequence number
		 */
		seq_num = get_seq_number();
	}
	barrier.wanted--;
	barrier.thr_count++;
	(void) mutex_unlock(&barrier.lock);

	if ((cmd == CMD_OFFLINE) ||
	    (cmd == CMD_SUSPEND) ||
	    (cmd == CMD_GETINFO)) {
		/*
		 * For these operations, need to ask modules to
		 * register any new resources that came online.
		 *
		 * This is because mount/umount are not instrumented
		 * to register with rcm before using system resources.
		 * Certain registration ops may fail during sync, which
		 * indicates race conditions. This cannot be avoided
		 * without changing mount/umount.
		 */
		rcmd_db_sync();
	}

	return (seq_num);
}
1234 * Decrement thread count
1236 void
1237 rcmd_thr_decr()
1240 * Decrement thread count and wake up reload/cleanup thread.
1242 (void) mutex_lock(&barrier.lock);
1243 barrier.last_update = time(NULL);
1244 if (--barrier.thr_count == 0)
1245 (void) cond_broadcast(&barrier.cv);
1246 (void) mutex_unlock(&barrier.lock);
/*
 * Wakeup all waiting threads as a result of SIGHUP
 */
static int sighup_received = 0;	/* checked by rcmd_start_timer */

void
rcmd_thr_signal()
{
	(void) mutex_lock(&barrier.lock);
	sighup_received = 1;
	(void) cond_broadcast(&barrier.cv);
	(void) mutex_unlock(&barrier.lock);
}
/*
 * Run the daemon idle timer loop; returns (and the caller shuts the
 * daemon down) once the daemon has been idle for 'timeout' seconds.
 *
 *	timeout == 0 : use the 5 minute default
 *	timeout  < 0 : never time out (wait for SIGHUP)
 *	timeout  > 0 : use and persist the given value
 */
void
rcmd_start_timer(int timeout)
{
	timestruc_t abstime;

	if (timeout == 0)
		timeout = RCM_DAEMON_TIMEOUT;	/* default to 5 minutes */
	else
		dr_req_list->idle_timeout = timeout;	/* persist timeout */

	/* NOTE(review): abstime.tv_nsec is never initialized here —
	 * presumably harmless on this platform; confirm. */
	if (timeout > 0) {
		abstime.tv_sec = time(NULL) + timeout;
	}

	(void) mutex_lock(&barrier.lock);
	for (;;) {
		int idletime;
		int is_active;

		if (timeout > 0)
			(void) cond_timedwait(&barrier.cv, &barrier.lock,
			    &abstime);
		else
			(void) cond_wait(&barrier.cv, &barrier.lock);

		/*
		 * If sighup received, change timeout to 0 so the daemon is
		 * shut down at the first possible moment
		 */
		if (sighup_received)
			timeout = 0;

		/*
		 * If timeout is negative, never shutdown the daemon
		 */
		if (timeout < 0)
			continue;

		/*
		 * Check for ongoing/pending activity
		 */
		is_active = (barrier.thr_count || barrier.wanted ||
		    (dr_req_list->n_req != 0));
		if (is_active) {
			abstime.tv_sec = time(NULL) + timeout;
			continue;
		}

		/*
		 * If idletime is less than timeout, continue to wait
		 */
		idletime = time(NULL) - barrier.last_update;
		if (idletime < timeout) {
			abstime.tv_sec = barrier.last_update + timeout;
			continue;
		}
		break;
	}

	(void) script_main_fini();

	rcm_log_message(RCM_INFO, gettext("rcm_daemon is shut down.\n"));
}
/*
 * Code related to polling client pid's
 * Not declared as static so that we can find this structure easily
 * in the core file.
 *
 * pids[], refcnt[] and fds[] are parallel arrays indexed together;
 * refcnt counts how many dr requests reference each pid.
 */
struct {
	int n_pids;		/* entries currently in use */
	int n_max_pids;		/* allocated capacity of the arrays */
	thread_t poll_tid;	/* poll thread id */
	int signaled;
	pid_t *pids;		/* initiator pids being watched */
	int *refcnt;		/* dr request references per pid */
	struct pollfd *fds;	/* poll descriptors, one per pid */
	cond_t cv;	/* the associated lock is rcm_req_lock */
} polllist;
1343 static int
1344 find_pid_index(pid_t pid)
1346 int i;
1348 for (i = 0; i < polllist.n_pids; i++) {
1349 if (polllist.pids[i] == pid) {
1350 return (i);
1353 return (-1);
1357 * Resize buffer for new pids
1359 static int
1360 get_pid_index()
1362 const int n_chunk = 10;
1364 int n_max;
1365 int index = polllist.n_pids;
1367 if (polllist.n_pids < polllist.n_max_pids) {
1368 polllist.n_pids++;
1369 return (index);
1372 if (polllist.n_max_pids == 0) {
1373 n_max = n_chunk;
1374 polllist.pids = s_calloc(n_max, sizeof (pid_t));
1375 polllist.refcnt = s_calloc(n_max, sizeof (int));
1376 polllist.fds = s_calloc(n_max, sizeof (struct pollfd));
1377 } else {
1378 n_max = polllist.n_max_pids + n_chunk;
1379 polllist.pids = s_realloc(polllist.pids,
1380 n_max * sizeof (pid_t));
1381 polllist.refcnt = s_realloc(polllist.refcnt,
1382 n_max * sizeof (int));
1383 polllist.fds = s_realloc(polllist.fds,
1384 n_max * sizeof (struct pollfd));
1386 polllist.n_max_pids = n_max;
1387 polllist.n_pids++;
1388 return (index);
1392 * rcm_req_lock must be held
1394 static void
1395 add_to_polling_list(pid_t pid)
1397 int fd, index;
1398 char procfile[MAXPATHLEN];
1400 if (pid == (pid_t)0)
1401 return;
1403 rcm_log_message(RCM_TRACE1, "add_to_polling_list(%ld)\n", pid);
1406 * Need to stop the poll thread before manipulating the polllist
1407 * since poll thread may possibly be using polllist.fds[] and
1408 * polllist.n_pids. As an optimization, first check if the pid
1409 * is already in the polllist. If it is, there is no need to
1410 * stop the poll thread. Just increment the pid reference count
1411 * and return;
1413 index = find_pid_index(pid);
1414 if (index != -1) {
1415 polllist.refcnt[index]++;
1416 return;
1419 stop_polling_thread();
1422 * In an attempt to stop the poll thread we may have released
1423 * and reacquired rcm_req_lock. So find the index again.
1425 index = find_pid_index(pid);
1426 if (index != -1) {
1427 polllist.refcnt[index]++;
1428 goto done;
1432 * Open a /proc file
1434 (void) sprintf(procfile, "/proc/%ld/as", pid);
1435 if ((fd = open(procfile, O_RDONLY)) == -1) {
1436 rcm_log_message(RCM_NOTICE, gettext("open(%s): %s\n"),
1437 procfile, strerror(errno));
1438 goto done;
1442 * add pid to polllist
1444 index = get_pid_index();
1445 polllist.pids[index] = pid;
1446 polllist.refcnt[index] = 1;
1447 polllist.fds[index].fd = fd;
1448 polllist.fds[index].events = 0;
1449 polllist.fds[index].revents = 0;
1451 rcm_log_message(RCM_DEBUG, "add pid %ld at index %ld\n", pid, index);
1453 done:
1454 start_polling_thread();
1458 * rcm_req_lock must be held
1460 static void
1461 remove_from_polling_list(pid_t pid)
1463 int i, index;
1465 if (pid == (pid_t)0)
1466 return;
1468 rcm_log_message(RCM_TRACE1, "remove_from_polling_list(%ld)\n", pid);
1471 * Need to stop the poll thread before manipulating the polllist
1472 * since poll thread may possibly be using polllist.fds[] and
1473 * polllist.n_pids. As an optimization, first check the pid
1474 * reference count. If the pid reference count is greater than 1
1475 * there is no need to stop the polling thread.
1478 index = find_pid_index(pid);
1479 if (index == -1) {
1480 rcm_log_message(RCM_NOTICE,
1481 gettext("error removing pid %ld from polling list\n"), pid);
1482 return;
1486 * decrement the pid refcnt
1488 if (polllist.refcnt[index] > 1) {
1489 polllist.refcnt[index]--;
1490 return;
1493 stop_polling_thread();
1496 * In an attempt to stop the poll thread we may have released
1497 * and reacquired rcm_req_lock. So find the index again.
1499 index = find_pid_index(pid);
1500 if (index == -1) {
1501 rcm_log_message(RCM_NOTICE,
1502 gettext("error removing pid %ld from polling list\n"), pid);
1503 goto done;
1506 if (--polllist.refcnt[index] > 0)
1507 goto done;
1510 * refcnt down to zero, delete pid from polling list
1512 (void) close(polllist.fds[index].fd);
1513 polllist.n_pids--;
1515 for (i = index; i < polllist.n_pids; i++) {
1516 polllist.pids[i] = polllist.pids[i + 1];
1517 polllist.refcnt[i] = polllist.refcnt[i + 1];
1518 bcopy(&polllist.fds[i + 1], &polllist.fds[i],
1519 sizeof (struct pollfd));
1522 rcm_log_message(RCM_DEBUG, "remove pid %ld at index %d\n", pid, index);
1524 done:
1525 start_polling_thread();
1528 void
1529 init_poll_thread()
1531 polllist.poll_tid = (thread_t)-1;
1534 void
1535 cleanup_poll_thread()
1537 (void) mutex_lock(&rcm_req_lock);
1538 if (polllist.poll_tid == thr_self()) {
1539 rcm_log_message(RCM_TRACE2,
1540 "cleanup_poll_thread: n_pids = %d\n", polllist.n_pids);
1541 polllist.poll_tid = (thread_t)-1;
1542 (void) cond_broadcast(&polllist.cv);
1544 (void) mutex_unlock(&rcm_req_lock);
1547 /*ARGSUSED*/
1548 static void *
1549 pollfunc(void *arg)
1551 sigset_t mask;
1553 rcm_log_message(RCM_TRACE2, "poll thread started. n_pids = %d\n",
1554 polllist.n_pids);
1557 * Unblock SIGUSR1 to allow polling thread to be killed
1559 (void) sigemptyset(&mask);
1560 (void) sigaddset(&mask, SIGUSR1);
1561 (void) thr_sigsetmask(SIG_UNBLOCK, &mask, NULL);
1563 (void) poll(polllist.fds, polllist.n_pids, (time_t)-1);
1566 * block SIGUSR1 to avoid being killed while holding a lock
1568 (void) sigemptyset(&mask);
1569 (void) sigaddset(&mask, SIGUSR1);
1570 (void) thr_sigsetmask(SIG_BLOCK, &mask, NULL);
1572 rcm_log_message(RCM_TRACE2, "returned from poll()\n");
1574 cleanup_poll_thread();
1576 (void) mutex_lock(&barrier.lock);
1577 need_cleanup = 1;
1578 (void) cond_broadcast(&barrier.cv);
1579 (void) mutex_unlock(&barrier.lock);
1581 return (NULL);
1585 * rcm_req_lock must be held
1587 void
1588 start_polling_thread()
1590 int err;
1592 if (rcmd_get_state() != RCMD_NORMAL)
1593 return;
1595 if (polllist.poll_tid != (thread_t)-1 || polllist.n_pids == 0)
1596 return;
1598 if ((err = thr_create(NULL, 0, pollfunc, NULL, THR_DETACHED,
1599 &polllist.poll_tid)) == 0)
1600 polllist.signaled = 0;
1601 else
1602 rcm_log_message(RCM_ERROR,
1603 gettext("failed to create polling thread: %s\n"),
1604 strerror(err));
1608 * rcm_req_lock must be held
1610 static void
1611 stop_polling_thread()
1613 int err;
1615 while (polllist.poll_tid != (thread_t)-1) {
1616 if (polllist.signaled == 0) {
1617 if ((err = thr_kill(polllist.poll_tid, SIGUSR1)) == 0)
1618 polllist.signaled = 1;
1619 else
1621 * thr_kill shouldn't have failed since the
1622 * poll thread id and the signal are valid.
1623 * So log an error. Since when thr_kill
1624 * fails no signal is sent (as per man page),
1625 * the cond_wait below will wait until the
1626 * the poll thread exits by some other means.
1627 * The poll thread, for example, exits on its
1628 * own when any DR initiator process that it
1629 * is currently polling exits.
1631 rcm_log_message(RCM_ERROR,
1632 gettext(
1633 "fail to kill polling thread %d: %s\n"),
1634 polllist.poll_tid, strerror(err));
1636 (void) cond_wait(&polllist.cv, &rcm_req_lock);