minix/servers/mib/tree.c

   1 /* MIB service - tree.c - tree access and management */
   2
   3 #include "mib.h"
   4
   5 /*
   6  * Does the given identifier fall within the range of static identifiers in the
   7  * given parent?  This check can be used to enumerate all static array entries
   8  * in the given parent, starting from zero.  The check does not guarantee that
   9  * the entry is actually for a valid node, nor does it guarantee that there is
  10  * not a dynamic node with this identifier.
  11  */
  12 #define IS_STATIC_ID(parent, id) ((unsigned int)(id) < (parent)->node_size)
  13
  14 /*
  15  * Scratch buffer, used for various cases of temporary data storage.  It must
  16  * be large enough to fit a sysctldesc structure followed by the longest
  17  * supported description.  It must also be large enough to serve as temporary
  18  * storage for data being written in the majority of cases.  Finally, it must
  19  * be large enough to contain an entire page, for mib_copyin_str().
  20  */
  21 #define MAXDESCLEN      1024    /* from NetBSD */
  22 #define SCRATCH_SIZE    MAX(PAGE_SIZE, sizeof(struct sysctldesc) + MAXDESCLEN)
  23 static char scratch[SCRATCH_SIZE] __aligned(sizeof(int32_t));
  24
  25 unsigned int mib_nodes;         /* how many nodes are there in the tree? */
  26 unsigned int mib_objects;       /* how many memory objects are allocated? */
  27 unsigned int mib_remotes;       /* how many remote subtrees are there? */
  28
  29 /*
  30  * Find a node through its parent node and identifier.  Return the node if it
  31  * was found, and optionally store a pointer to the pointer to its dynode
  32  * superstructure (for removal).  If no matching node was found, return NULL.
  33  */
  34 static struct mib_node *
  35 mib_find(struct mib_node * parent, int id, struct mib_dynode *** prevpp)
  36 {
  37         struct mib_node *node;
  38         struct mib_dynode **dynp;
  39
  40         if (id < 0)
  41                 return NULL;
  42
  43         /*
  44          * Is there a static node with this identifier?  The static nodes are
  45          * all in a single array, so lookup is O(1) for these nodes.  We use
  46          * the node flags field to see whether the array entry is valid.
  47          */
  48         if (IS_STATIC_ID(parent, id)) {
  49                 node = &parent->node_scptr[id];
  50
  51                 if (node->node_flags != 0) {
  52                         /* Found a matching static node. */
  53                         if (prevpp != NULL)
  54                                 *prevpp = NULL;
  55                         return node;
  56                 }
  57         }
  58
  59         /*
  60          * Is there a dynamic node with this identifier?  The dynamic nodes
  61          * form a linked list.  This is predominantly because userland may pick
  62          * the identifier number at creation time, so we cannot rely on all
  63          * dynamically created nodes falling into a small identifier range.
  64          * That eliminates the option of a dynamic array indexed by identifier,
  65          * and a linked list is the simplest next option.  Thus, dynamic node
  66          * lookup is O(n).  However, since the list is sorted by identifier,
  67          * we may be able to stop the search early.
  68          */
  69         for (dynp = &parent->node_dcptr; *dynp != NULL;
  70             dynp = &((*dynp)->dynode_next)) {
  71                 if ((*dynp)->dynode_id == id) {
  72                         /* Found a matching dynamic node. */
  73                         if (prevpp != NULL)
  74                                 *prevpp = dynp;
  75                         return &(*dynp)->dynode_node;
  76                 } else if ((*dynp)->dynode_id > id)
  77                         break; /* no need to look further */
  78         }
  79
  80         return NULL;
  81 }
  82
  83 /*
  84  * Copy out a node to userland, using the exchange format for nodes (namely,
  85  * a sysctlnode structure).  Return the size of the object that is (or, if the
  86  * node falls outside the requested data range, would be) copied out on
  87  * success, or a negative error code on failure.  The function may return 0
  88  * to indicate that nothing was copied out after all (this is unused here).
  89  */
  90 static ssize_t
  91 mib_copyout_node(struct mib_call * call, struct mib_oldp * oldp, size_t off,
  92         int id, const struct mib_node * node)
  93 {
  94         struct sysctlnode scn;
  95         int visible;
  96
  97         if (!mib_inrange(oldp, off))
  98                 return sizeof(scn); /* nothing to do */
  99
 100         memset(&scn, 0, sizeof(scn));
 101
 102         /*
 103          * We use CTLFLAG_PARENT, CTLFLAG_VERIFY, and CTLFLAG_REMOTE internally
 104          * only.  NetBSD uses the values of these flags for different purposes.
 105          * Either way, do not expose them to userland.
 106          */
 107         scn.sysctl_flags = SYSCTL_VERSION | (node->node_flags &
 108             ~(CTLFLAG_PARENT | CTLFLAG_VERIFY | CTLFLAG_REMOTE));
 109         scn.sysctl_num = id;
 110         strlcpy(scn.sysctl_name, node->node_name, sizeof(scn.sysctl_name));
 111         scn.sysctl_ver = node->node_ver;
 112         scn.sysctl_size = node->node_size;
 113
 114         /* Some information is only visible if the user can access the node. */
 115         visible = (!(node->node_flags & CTLFLAG_PRIVATE) || mib_authed(call));
 116
 117         /*
 118          * For immediate types, store the immediate value in the resulting
 119          * structure, unless the caller is not authorized to obtain the value.
 120          */
 121         if ((node->node_flags & CTLFLAG_IMMEDIATE) && visible) {
 122                 switch (SYSCTL_TYPE(node->node_flags)) {
 123                 case CTLTYPE_BOOL:
 124                         scn.sysctl_bdata = node->node_bool;
 125                         break;
 126                 case CTLTYPE_INT:
 127                         scn.sysctl_idata = node->node_int;
 128                         break;
 129                 case CTLTYPE_QUAD:
 130                         scn.sysctl_qdata = node->node_quad;
 131                 }
 132         }
 133
 134         /* Special rules apply to parent nodes. */
 135         if (SYSCTL_TYPE(node->node_flags) == CTLTYPE_NODE) {
 136                 /* Report the node size the way NetBSD does, just in case. */
 137                 scn.sysctl_size = sizeof(scn);
 138
 139                 /*
 140                  * If this is a remote node, use the values we have of the root
 141                  * of the remote subtree.  If we did not have these values, we
 142                  * would have to call into the remote service here, which for
 143                  * reliability purposes is a bad idea.
 144                  *
 145                  * If this is a real parent node, report child information.  In
 146                  * both these cases, expose child information only if the node
 147                  * itself is accessible by the caller.
 148                  *
 149                  * If this is a function-driven node, indicate this by setting
 150                  * a nonzero function address.  This allows trace(1) to
 151                  * determine that it should not attempt to descend into this
 152                  * part of the tree as usual, because a) accessing subnodes may
 153                  * have side effects, and b) meta-identifiers may not work as
 154                  * expected in these parts of the tree.  Do not return the real
 155                  * function pointer, as this would leak anti-ASR information.
 156                  */
 157                 if (node->node_flags & CTLFLAG_REMOTE) {
 158                         if (visible) {
 159                                 scn.sysctl_csize = node->node_rcsize;
 160                                 scn.sysctl_clen = node->node_rclen;
 161                         }
 162                 } else if (node->node_flags & CTLFLAG_PARENT) {
 163                         if (visible) {
 164                                 scn.sysctl_csize = node->node_csize;
 165                                 scn.sysctl_clen = node->node_clen;
 166                         }
 167                 } else
 168                         scn.sysctl_func = SYSCTL_NODE_FN;
 169         }
 170
 171         /* Copy out the resulting node. */
 172         return mib_copyout(oldp, off, &scn, sizeof(scn));
 173 }
 174
 175 /*
 176  * Given a query on a non-leaf (parent) node, provide the user with an array of
 177  * this node's children.
 178  */
 179 static ssize_t
 180 mib_query(struct mib_call * call, struct mib_node * parent,
 181         struct mib_oldp * oldp, struct mib_newp * newp)
 182 {
 183         struct sysctlnode scn;
 184         struct mib_node *node;
 185         struct mib_dynode *dynode;
 186         size_t off;
 187         int r, id;
 188
 189         /* If the user passed in version numbers, check them. */
 190         if (newp != NULL) {
 191                 if ((r = mib_copyin(newp, &scn, sizeof(scn))) != OK)
 192                         return r;
 193
 194                 if (SYSCTL_VERS(scn.sysctl_flags) != SYSCTL_VERSION)
 195                         return EINVAL;
 196
 197                 /*
 198                  * If a node version number is given, it must match the version
 199                  * of the parent or the root.
 200                  */
 201                 if (scn.sysctl_ver != 0 &&
 202                     scn.sysctl_ver != mib_root.node_ver &&
 203                     scn.sysctl_ver != parent->node_ver)
 204                         return EINVAL;
 205         }
 206
 207         /*
 208          * We need not return the nodes strictly in ascending order of
 209          * identifiers, as this is not expected by userland.  For example,
 210          * sysctlgetmibinfo(3) performs its own sorting after a query.
 211          * Thus, we can go through the static and dynamic nodes separately.
 212          */
 213         off = 0;
 214
 215         /* First enumerate the static nodes. */
 216         for (id = 0; IS_STATIC_ID(parent, id); id++) {
 217                 node = &parent->node_scptr[id];
 218
 219                 if (node->node_flags == 0)
 220                         continue;
 221
 222                 if ((r = mib_copyout_node(call, oldp, off, id, node)) < 0)
 223                         return r;
 224                 off += r;
 225         }
 226
 227         /* Then enumerate the dynamic nodes. */
 228         for (dynode = parent->node_dcptr; dynode != NULL;
 229             dynode = dynode->dynode_next) {
 230                 node = &dynode->dynode_node;
 231
 232                 if ((r = mib_copyout_node(call, oldp, off, dynode->dynode_id,
 233                     node)) < 0)
 234                         return r;
 235                 off += r;
 236         }
 237
 238         return off;
 239 }
 240
 241 /*
 242  * Check whether the given name buffer contains a valid node name string.  If
 243  * the name is nonempty, properly terminated, and contains only acceptable
 244  * characters, return the length of the string excluding null terminator.
 245  * Otherwise, return zero to indicate failure.
 246  */
 247 static size_t
 248 mib_check_name(const char * name, size_t namesize)
 249 {
 250         size_t namelen;
 251         char c;
 252
 253         /* Names must be nonempty, null terminated, C symbol style strings. */
 254         for (namelen = 0; namelen < namesize; namelen++) {
 255                 if ((c = name[namelen]) == '\0')
 256                         break;
 257                 /* A-Z, a-z, 0-9, _ only, and no digit as first character. */
 258                 if (!((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') ||
 259                     c == '_' || (c >= '0' && c <= '9' && namelen > 0)))
 260                         return 0;
 261         }
 262         if (namelen == 0 || namelen == namesize)
 263                 return 0;
 264
 265         return namelen;
 266 }
 267
 268 /*
 269  * Scan a parent node's children, as part of new node creation.  Search for
 270  * either a free node identifier (if given_id < 0) or collisions with the node
 271  * identifier to use (if given_id >= 0).  Also check for name collisions.  Upon
 272  * success, return OK, with the resulting node identifier stored in 'idp' and a
 273  * pointer to the pointer for the new dynamic node stored in 'prevpp'.  Upon
 274  * failure, return an error code.  If the failure is EEXIST, 'idp' will contain
 275  * the ID of the conflicting node, and 'nodep' will point to the node.
 276  */
 277 static int
 278 mib_scan(struct mib_node * parent, int given_id, const char * name, int * idp,
 279         struct mib_dynode *** prevpp, struct mib_node ** nodep)
 280 {
 281         struct mib_dynode **prevp, **dynp;
 282         struct mib_node *node;
 283         int id;
 284
 285         /*
 286          * We must verify that no entry already exists with the given name.  In
 287          * addition, if a nonnegative identifier is given, we should use that
 288          * identifier and make sure it does not already exist.  Otherwise, we
 289          * must find a free identifier.  Finally, we sort the dynamic nodes in
 290          * ascending identifier order, so we must find the right place at which
 291          * to insert the new node.
 292          *
 293          * For finding a free identifier, choose an ID that falls (well)
 294          * outside the static range, both to avoid accidental retrieval by an
 295          * application that uses a static ID, and to simplify verifying that
 296          * the ID is indeed free.  The sorting of dynamic nodes by identifier
 297          * ensures that searching for a free identifier is O(n).
 298          *
 299          * At this time, we do not support some NetBSD features.  We do not
 300          * force success if the new node is sufficiently like an existing one.
 301          * Also, we do not use global autoincrement for dynamic identifiers,
 302          * although that could easily be changed.
 303          */
 304
 305         /* First check the static node array, just for collisions. */
 306         for (id = 0; IS_STATIC_ID(parent, id); id++) {
 307                 node = &parent->node_scptr[id];
 308                 if (node->node_flags == 0)
 309                         continue;
 310                 if (id == given_id || !strcmp(name, node->node_name)) {
 311                         *idp = id;
 312                         *nodep = node;
 313                         return EEXIST;
 314                 }
 315         }
 316
 317         /*
 318          * Then try to find the place to insert a new dynamic node.  At the
 319          * same time, check for both identifier and name collisions.
 320          */
 321         if (given_id >= 0)
 322                 id = given_id;
 323         else
 324                 id = MAX(CREATE_BASE, parent->node_size);
 325
 326         for (prevp = &parent->node_dcptr; *prevp != NULL;
 327             prevp = &((*prevp)->dynode_next)) {
 328                 if ((*prevp)->dynode_id > id)
 329                         break;
 330                 if ((*prevp)->dynode_id == id) {
 331                         if (given_id >= 0) {
 332                                 *idp = id;
 333                                 *nodep = &(*prevp)->dynode_node;
 334                                 return EEXIST;
 335                         } else
 336                                 id++;
 337                 }
 338                 if (!strcmp(name, (*prevp)->dynode_node.node_name)) {
 339                         *idp = (*prevp)->dynode_id;
 340                         *nodep = &(*prevp)->dynode_node;
 341                         return EEXIST;
 342                 }
 343         }
 344
 345         /* Finally, check the rest of the dynamic nodes for name collisions. */
 346         for (dynp = prevp; *dynp != NULL; dynp = &((*dynp)->dynode_next)) {
 347                 assert((*dynp)->dynode_id > id);
 348
 349                 if (!strcmp(name, (*dynp)->dynode_node.node_name)) {
 350                         *idp = (*dynp)->dynode_id;
 351                         *nodep = &(*dynp)->dynode_node;
 352                         return EEXIST;
 353                 }
 354         }
 355
 356         *idp = id;
 357         *prevpp = prevp;
 358         return OK;
 359 }
 360
 361 /*
 362  * Copy in a string from the user process, located at the given remote address,
 363  * into the given local buffer.  If no buffer is given, just compute the length
 364  * of the string.  On success, return OK.  If 'sizep' is not NULL, it will be
 365  * filled with the string size, including the null terminator.  If a non-NULL
 366  * buffer was given, the string will be copied into the provided buffer (also
 367  * including null terminator).  Return an error code on failure, which includes
 368  * the case that no null terminator was found within the range of bytes that
 369  * would fit in the given buffer.
 370  */
 371 static int
 372 mib_copyin_str(struct mib_newp * __restrict newp, vir_bytes addr,
 373         char * __restrict buf, size_t bufsize, size_t * __restrict sizep)
 374 {
 375         char *ptr, *endp;
 376         size_t chunk, len;
 377         int r;
 378
 379         assert(newp != NULL);
 380         assert(bufsize <= SSIZE_MAX);
 381
 382         if (addr == 0)
 383                 return EINVAL;
 384
 385         /*
 386          * NetBSD has a kernel routine for copying in a string from userland.
 387          * MINIX3 does not, since its system call interface has always relied
 388          * on userland passing in string lengths.  The sysctl(2) API does not
 389          * provide the string length, and thus, we have to do a bit of guess
 390          * work.  If we copy too little at once, performance suffers.  If we
 391          * copy too much at once, we may trigger an unneeded page fault.  Make
 392          * use of page boundaries to strike a balance between those two.  If we
 393          * are requested to just get the string length, use the scratch buffer.
 394          */
 395         len = 0;
 396
 397         while (bufsize > 0) {
 398                 chunk = PAGE_SIZE - (addr % PAGE_SIZE);
 399                 if (chunk > bufsize)
 400                         chunk = bufsize;
 401
 402                 ptr = (buf != NULL) ? &buf[len] : scratch;
 403                 if ((r = mib_copyin_aux(newp, addr, ptr, chunk)) != OK)
 404                         return r;
 405
 406                 if ((endp = memchr(ptr, '\0', chunk)) != NULL) {
 407                         /* A null terminator was found - success. */
 408                         if (sizep != NULL)
 409                                 *sizep = len + (size_t)(endp - ptr) + 1;
 410                         return OK;
 411                 }
 412
 413                 addr += chunk;
 414                 len += chunk;
 415                 bufsize -= chunk;
 416         }
 417
 418         /* No null terminator found. */
 419         return EINVAL;
 420 }
 421
 422 /*
 423  * Increase the version of the root node, and copy this new version to all
 424  * nodes on the path to the given node, including that node itself.
 425  */
 426 static void
 427 mib_upgrade(struct mib_node * node)
 428 {
 429         uint32_t ver;
 430
 431         assert(node != NULL);
 432
 433         /*
 434          * The root node determines the version of the entire tree.  Do not use
 435          * version number 0, as a zero version number indicates no interest in
 436          * versions elsewhere.
 437          */
 438
 439         ver = mib_root.node_ver + 1;
 440         if (ver == 0)
 441                 ver = 1;
 442
 443         /* Copy the new version to all the nodes on the path. */
 444         do {
 445                 node->node_ver = ver;
 446
 447                 node = node->node_parent;
 448         } while (node != NULL);
 449 }
 450
 451 /*
 452  * Add a new dynamically allocated node into the tree, inserting it into the
 453  * linked-list position of the parent tree as given by 'prevp'.  Also update
 454  * versions and counters accordingly.  This function never fails.
 455  */
 456 static void
 457 mib_add(struct mib_dynode * dynode, struct mib_dynode ** prevp)
 458 {
 459         struct mib_node *parent;
 460
 461         parent = dynode->dynode_node.node_parent;
 462         assert(parent != NULL);
 463
 464         /* Link the dynamic node into the list, in the right place. */
 465         assert(prevp != NULL);
 466         dynode->dynode_next = *prevp;
 467         *prevp = dynode;
 468
 469         /* The parent node now has one more child. */
 470         parent->node_csize++;
 471         parent->node_clen++;
 472
 473         /* There is now one more node in the tree. */
 474         mib_nodes++;
 475
 476         /*
 477          * Bump the version of all nodes on the path to the new node, including
 478          * the node itself.
 479          */
 480         mib_upgrade(&dynode->dynode_node);
 481 }
 482
 483 /*
 484  * Create a node.
 485  */
 486 static ssize_t
 487 mib_create(struct mib_call * call, struct mib_node * parent,
 488         struct mib_oldp * oldp, struct mib_newp * newp)
 489 {
 490         struct mib_dynode *dynode, **prevp;
 491         struct mib_node *node;
 492         struct sysctlnode scn;
 493         size_t namelen, size;
 494         ssize_t len;
 495         bool b;
 496         char c;
 497         int r, id;
 498
 499         /* This is a privileged operation. */
 500         if (!mib_authed(call))
 501                 return EPERM;
 502
 503         /*
 504          * The parent must not be a remote node, but this is already implied by
 505          * the fact that we got here at all.
 506          */
 507         assert(SYSCTL_TYPE(parent->node_flags) == CTLTYPE_NODE);
 508         assert(!(parent->node_flags & CTLFLAG_REMOTE));
 509
 510         /* The parent node must not be marked as read-only. */
 511         if (!(parent->node_flags & CTLFLAG_READWRITE))
 512                 return EPERM;
 513
 514         /*
 515          * Has the parent reached its child node limit?  This check is entirely
 516          * theoretical as long as we support only 32-bit virtual memory.
 517          */
 518         if (parent->node_csize == INT_MAX)
 519                 return EINVAL;
 520         assert(parent->node_clen <= parent->node_csize);
 521
 522         /* The caller must supply information on the child node to create. */
 523         if (newp == NULL)
 524                 return EINVAL;
 525
 526         if ((r = mib_copyin(newp, &scn, sizeof(scn))) != OK)
 527                 return r;
 528
 529         /*
 530          * We perform as many checks as possible before we start allocating
 531          * memory.  Then again, after allocation, copying in data may still
 532          * fail.  Unlike when setting values, we do not first copy data into a
 533          * temporary buffer here, because we do not need to: if the copy fails,
 534          * the entire create operation fails, so atomicity is not an issue.
 535          */
 536         if (SYSCTL_VERS(scn.sysctl_flags) != SYSCTL_VERSION)
 537                 return EINVAL;
 538
 539         /*
 540          * If a node version number is given, it must match the version of
 541          * either the parent or the root node.  The given version number is
 542          * *not* used for the node being created.
 543          */
 544         if (scn.sysctl_ver != 0 && scn.sysctl_ver != mib_root.node_ver &&
 545             scn.sysctl_ver != parent->node_ver)
 546                 return EINVAL;
 547
 548         /*
 549          * Validate the node flags.  In addition to the NetBSD-allowed flags,
 550          * we also allow UNSIGNED, and leave its interpretation to userland.
 551          */
 552         if (SYSCTL_FLAGS(scn.sysctl_flags) &
 553             ~(SYSCTL_USERFLAGS | CTLFLAG_UNSIGNED))
 554                 return EINVAL;
 555
 556         if (!(scn.sysctl_flags & CTLFLAG_IMMEDIATE)) {
 557                 /*
 558                  * Without either IMMEDIATE or OWNDATA, data pointers are
 559                  * actually kernel addresses--a concept we do not support.
 560                  * Otherwise, if IMMEDIATE is not set, we are going to have to
 561                  * allocate extra memory for the data, so force OWNDATA to be.
 562                  * set.  Node-type nodes have no data, though.
 563                  */
 564                 if (SYSCTL_TYPE(scn.sysctl_flags) != CTLTYPE_NODE) {
 565                         if (!(scn.sysctl_flags & CTLFLAG_OWNDATA) &&
 566                             scn.sysctl_data != NULL)
 567                                 return EINVAL; /* not meaningful on MINIX3 */
 568
 569                         scn.sysctl_flags |= CTLFLAG_OWNDATA;
 570                 }
 571         } else if (scn.sysctl_flags & CTLFLAG_OWNDATA)
 572                 return EINVAL;
 573
 574         /* The READWRITE flag consists of multiple bits.  Sanitize. */
 575         if (scn.sysctl_flags & CTLFLAG_READWRITE)
 576                 scn.sysctl_flags |= CTLFLAG_READWRITE;
 577
 578         /* Validate the node type and size, and do some additional checks. */
 579         switch (SYSCTL_TYPE(scn.sysctl_flags)) {
 580         case CTLTYPE_BOOL:
 581                 if (scn.sysctl_size != sizeof(bool))
 582                         return EINVAL;
 583                 break;
 584         case CTLTYPE_INT:
 585                 if (scn.sysctl_size != sizeof(int))
 586                         return EINVAL;
 587                 break;
 588         case CTLTYPE_QUAD:
 589                 if (scn.sysctl_size != sizeof(u_quad_t))
 590                         return EINVAL;
 591                 break;
 592         case CTLTYPE_STRING:
 593                 /*
 594                  * For strings, a zero length means that we are supposed to
 595                  * allocate a buffer size based on the given string size.
 596                  */
 597                 if (scn.sysctl_size == 0 && scn.sysctl_data != NULL) {
 598                         if ((r = mib_copyin_str(newp,
 599                             (vir_bytes)scn.sysctl_data, NULL, SSIZE_MAX,
 600                             &size)) != OK)
 601                                 return r;
 602                         scn.sysctl_size = size;
 603                 }
 604                 /* FALLTHROUGH */
 605         case CTLTYPE_STRUCT:
 606                 /*
 607                  * We do not set an upper size on the data size, since it would
 608                  * still be possible to create a large number of nodes, and
 609                  * this is a privileged operation ayway.
 610                  */
 611                 if (scn.sysctl_size == 0 || scn.sysctl_size > SSIZE_MAX)
 612                         return EINVAL;
 613                 if (scn.sysctl_flags & CTLFLAG_IMMEDIATE)
 614                         return EINVAL;
 615                 break;
 616         case CTLTYPE_NODE:
 617                 /*
 618                  * The zero size is an API requirement, but we also rely on the
 619                  * zero value internally, as the node has no static children.
 620                  */
 621                 if (scn.sysctl_size != 0)
 622                         return EINVAL;
 623                 if (scn.sysctl_flags & (CTLFLAG_IMMEDIATE | CTLFLAG_OWNDATA))
 624                         return EINVAL;
 625                 if (scn.sysctl_csize != 0 || scn.sysctl_clen != 0 ||
 626                     scn.sysctl_child != NULL)
 627                         return EINVAL;
 628                 break;
 629         default:
 630                 return EINVAL;
 631         }
 632
 633         if (scn.sysctl_func != NULL || scn.sysctl_parent != NULL)
 634                 return EINVAL;
 635
 636         /* Verify that the given name is valid, and get its string length. */
 637         namelen = mib_check_name(scn.sysctl_name, sizeof(scn.sysctl_name));
 638
 639         if (namelen == 0)
 640                 return EINVAL;
 641
 642         /*
 643          * Find a free identifier, or check for ID collisions if a specific
 644          * identifier was given.  At the same time, scan for name collisions,
 645          * and find the location at which to insert the new node in the list.
 646          */
 647         r = mib_scan(parent, scn.sysctl_num, scn.sysctl_name, &id, &prevp,
 648             &node);
 649
 650         if (r != OK) {
 651                 /*
 652                  * On collisions, if requested, copy out the existing node.
 653                  * This does not quite fit the general interaction model, as
 654                  * the service must now return a nonzero old length from a call
 655                  * that actually failed (in contrast to ENOMEM failures).
 656                  */
 657                 if (r == EEXIST && oldp != NULL) {
 658                         len = mib_copyout_node(call, oldp, 0, id, node);
 659
 660                         if (len > 0)
 661                                 mib_setoldlen(call, len);
 662                 }
 663
 664                 return r;
 665         }
 666
 667         /*
 668          * All checks so far have passed.  "id" now contains the new node
 669          * identifier, and "prevp" points to the pointer at which to insert the
 670          * new node in its parent's linked list of dynamic nodes.
 671          *
 672          * We can now attempt to create and initialize a new dynamic node.
 673          * Allocating nodes this way may cause heavy memory fragmentation over
 674          * time, but we do not expect the tree to see heavy modification at run
 675          * time, and the superuser has easier ways to get the MIB service in
 676          * trouble.  We note that even in low-memory conditions, the MIB
 677          * service is always able to provide basic functionality.
 678          */
 679         size = sizeof(*dynode) + namelen;
 680         if (!(scn.sysctl_flags & CTLFLAG_IMMEDIATE))
 681                 size += scn.sysctl_size;
 682
 683         if ((dynode = malloc(size)) == NULL)
 684                 return EINVAL; /* do not return ENOMEM */
 685         mib_objects++;
 686
 687         /* From here on, we have to free "dynode" before returning an error. */
 688         r = OK;
 689
 690         memset(dynode, 0, sizeof(*dynode)); /* no need to zero all of "size" */
 691         dynode->dynode_id = id;
 692         strlcpy(dynode->dynode_name, scn.sysctl_name, namelen + 1);
 693
 694         node = &dynode->dynode_node;
 695         node->node_flags = scn.sysctl_flags & ~SYSCTL_VERS_MASK;
 696         if (SYSCTL_TYPE(scn.sysctl_flags) == CTLTYPE_NODE)
 697                 node->node_flags |= CTLFLAG_PARENT;
 698         node->node_size = scn.sysctl_size;
 699         node->node_parent = parent;
 700         node->node_name = dynode->dynode_name;
 701
 702         /* Initialize the node value. */
 703         if (scn.sysctl_flags & CTLFLAG_IMMEDIATE) {
 704                 switch (SYSCTL_TYPE(scn.sysctl_flags)) {
 705                 case CTLTYPE_BOOL:
 706                         /* Sanitize booleans.  See the C99 _Bool comment. */
 707                         memcpy(&c, &scn.sysctl_bdata, sizeof(c));
 708                         node->node_bool = (bool)c;
 709                         break;
 710                 case CTLTYPE_INT:
 711                         node->node_int = scn.sysctl_idata;
 712                         break;
 713                 case CTLTYPE_QUAD:
 714                         node->node_quad = scn.sysctl_qdata;
 715                         break;
 716                 default:
 717                         assert(0);
 718                 }
 719         } else if (SYSCTL_TYPE(scn.sysctl_flags) != CTLTYPE_NODE) {
 720                 node->node_data = dynode->dynode_name + namelen + 1;
 721
 722                 /* Did the user supply initial data?  If not, use zeroes. */
 723                 if (scn.sysctl_data != NULL) {
 724                         /*
 725                          * For strings, do not copy in more than needed.  This
 726                          * is just a nice feature which allows initialization
 727                          * of large string buffers with short strings.
 728                          */
 729                         if (SYSCTL_TYPE(scn.sysctl_flags) == CTLTYPE_STRING)
 730                                 r = mib_copyin_str(newp,
 731                                     (vir_bytes)scn.sysctl_data,
 732                                     node->node_data, scn.sysctl_size, NULL);
 733                         else
 734                                 r = mib_copyin_aux(newp,
 735                                     (vir_bytes)scn.sysctl_data,
 736                                     node->node_data, scn.sysctl_size);
 737                 } else
 738                         memset(node->node_data, 0, scn.sysctl_size);
 739
 740                 /*
 741                  * Sanitize booleans.  See the C99 _Bool comment elsewhere.
 742                  * In this case it is not as big of a deal, as we will not be
 743                  * accessing the boolean value directly ourselves.
 744                  */
 745                 if (r == OK && SYSCTL_TYPE(scn.sysctl_flags) == CTLTYPE_BOOL) {
 746                         b = (bool)*(char *)node->node_data;
 747                         memcpy(node->node_data, &b, sizeof(b));
 748                 }
 749         }
 750
 751         /*
 752          * Even though it would be entirely possible to set a description right
 753          * away as well, this does not seem to be supported on NetBSD at all.
 754          */
 755
 756         /* Deal with earlier failures now. */
 757         if (r != OK) {
 758                 free(dynode);
 759                 mib_objects--;
 760
 761                 return r;
 762         }
 763
 764         /*
 765          * At this point, actual creation can no longer fail.  Add the node
 766          * into the tree, and update versions and counters.
 767          */
 768         mib_add(dynode, prevp);
 769
 770         /*
 771          * Copy out the newly created node as resulting ("old") data.  Do not
 772          * undo the creation if this fails, though.
 773          */
 774         return mib_copyout_node(call, oldp, 0, id, node);
 775 }
 776
 777 /*
 778  * Remove the given node from the tree.  If 'prevp' is NULL, the node is a
 779  * static node which should be zeroed out.  If 'prevp' is not NULL, the node is
 780  * a dynamic node which should be freed; 'prevp' will then point to the pointer
 781  * to its dynode container.  Also update versions and counters as appropriate.
 782  * This function never fails.
 783  */
 784 static void
 785 mib_remove(struct mib_node * node, struct mib_dynode ** prevp)
 786 {
 787         struct mib_dynode *dynode;
 788         struct mib_node *parent;
 789
 790         parent = node->node_parent;
 791         assert(parent != NULL);
 792
 793         /* If the description was allocated, free it. */
 794         if (node->node_flags & CTLFLAG_OWNDESC) {
 795                 free(__UNCONST(node->node_desc));
 796                 mib_objects--;
 797         }
 798
 799         /*
 800          * Static nodes only use static memory, and dynamic nodes have the data
 801          * area embedded in the dynode object.  In neither case is data memory
 802          * allocated separately, and thus, it need never be freed separately.
 803          * Therefore we *must not* check CTLFLAG_OWNDATA here.
 804          */
 805
 806         assert(parent->node_csize > 0);
 807         assert(parent->node_clen > 0);
 808
 809         /*
 810          * Dynamic nodes must be freed.  Freeing the dynode object also frees
 811          * the node name and any associated data.  Static nodes are zeroed out,
 812          * and the static memory they referenced will become inaccessible.
 813          */
 814         if (prevp != NULL) {
 815                 dynode = *prevp;
 816                 *prevp = dynode->dynode_next;
 817
 818                 assert(node == &dynode->dynode_node);
 819
 820                 free(dynode);
 821                 mib_objects--;
 822
 823                 parent->node_csize--;
 824         } else
 825                 memset(node, 0, sizeof(*node));
 826
 827         parent->node_clen--;
 828
 829         mib_nodes--;
 830
 831         /* Bump the version of all nodes on the path to the destroyed node. */
 832         mib_upgrade(parent);
 833 }
 834
 835 /*
 836  * Destroy a node.
 837  */
 838 static ssize_t
 839 mib_destroy(struct mib_call * call, struct mib_node * parent,
 840         struct mib_oldp * oldp, struct mib_newp * newp)
 841 {
 842         struct mib_dynode **prevp;
 843         struct mib_node *node;
 844         struct sysctlnode scn;
 845         ssize_t r;
 846
 847         /* This is a privileged operation. */
 848         if (!mib_authed(call))
 849                 return EPERM;
 850
 851         /* The parent node must not be marked as read-only. */
 852         if (!(parent->node_flags & CTLFLAG_READWRITE))
 853                 return EPERM;
 854
 855         /* The caller must specify which child node to destroy. */
 856         if (newp == NULL)
 857                 return EINVAL;
 858
 859         if ((r = mib_copyin(newp, &scn, sizeof(scn))) != OK)
 860                 return r;
 861
 862         if (SYSCTL_VERS(scn.sysctl_flags) != SYSCTL_VERSION)
 863                 return EINVAL;
 864
 865         /* Locate the child node. */
 866         if ((node = mib_find(parent, scn.sysctl_num, &prevp)) == NULL)
 867                 return ENOENT;
 868
 869         /* The node must not be marked as permanent. */
 870         if (node->node_flags & CTLFLAG_PERMANENT)
 871                 return EPERM;
 872
 873         /* For node-type nodes, extra rules apply. */
 874         if (SYSCTL_TYPE(node->node_flags) == CTLTYPE_NODE) {
 875                 /* The node must not be a mount point. */
 876                 if (node->node_flags & CTLFLAG_REMOTE)
 877                         return EBUSY;
 878
 879                 /* The node must not have an associated function. */
 880                 if (!(node->node_flags & CTLFLAG_PARENT))
 881                         return EPERM;
 882
 883                 /* The target node must itself not have child nodes. */
 884                 if (node->node_clen != 0)
 885                         return ENOTEMPTY;
 886         }
 887
 888         /* If the user supplied a version, it must match the node version. */
 889         if (scn.sysctl_ver != 0 && scn.sysctl_ver != node->node_ver)
 890                 return EINVAL;  /* NetBSD inconsistently throws ENOENT here */
 891
 892         /* If the user supplied a name, it must match the node name. */
 893         if (scn.sysctl_name[0] != '\0') {
 894                 if (memchr(scn.sysctl_name, '\0',
 895                     sizeof(scn.sysctl_name)) == NULL)
 896                         return EINVAL;
 897                 if (strcmp(scn.sysctl_name, node->node_name))
 898                         return EINVAL;  /* also ENOENT on NetBSD */
 899         }
 900
 901         /*
 902          * Copy out the old node if requested, otherwise return the length
 903          * anyway.  The node will be destroyed even if this call fails,
 904          * because that is how NetBSD behaves.
 905          */
 906         r = mib_copyout_node(call, oldp, 0, scn.sysctl_num, node);
 907
 908         /*
 909          * Remove the node from the tree.  The procedure depends on whether the
 910          * node is static (prevp == NULL) or dynamic (prevp != NULL).  Also
 911          * update versions and counters.
 912          */
 913         mib_remove(node, prevp);
 914
 915         return r;
 916 }
 917
 918 /*
 919  * Copy out a node description to userland, using the exchange format for node
 920  * descriptions (namely, a sysctldesc structure).  Return the size of the
 921  * object that is (or, if the description falls outside the requested data
 922  * range, would be) copied out on success, or a negative error code on failure.
 923  * The function may return 0 to indicate that nothing was copied out after all.
 924  */
 925 static ssize_t
 926 mib_copyout_desc(struct mib_call * call, struct mib_oldp * oldp, size_t off,
 927         int id, const struct mib_node * node)
 928 {
 929         struct sysctldesc *scd;
 930         size_t size;
 931         int r;
 932
 933         /* Descriptions of private nodes are considered private too. */
 934         if ((node->node_flags & CTLFLAG_PRIVATE) && !mib_authed(call))
 935                 return 0;
 936
 937         /* The description length includes the null terminator. */
 938         if (node->node_desc != NULL)
 939                 size = strlen(node->node_desc) + 1;
 940         else
 941                 size = 1;
 942
 943         assert(sizeof(*scd) + size <= sizeof(scratch));
 944
 945         scd = (struct sysctldesc *)scratch;
 946         memset(scd, 0, sizeof(*scd));
 947         scd->descr_num = id;
 948         scd->descr_ver = node->node_ver;
 949         scd->descr_len = size;
 950         if (node->node_desc != NULL)
 951                 strlcpy(scd->descr_str, node->node_desc,
 952                     sizeof(scratch) - sizeof(*scd));
 953         else
 954                 scd->descr_str[0] = '\0';
 955
 956         size += offsetof(struct sysctldesc, descr_str);
 957
 958         if ((r = mib_copyout(oldp, off, scratch, size)) < 0)
 959                 return r;
 960
 961         /*
 962          * By aligning just the size, we may leave garbage between the entries
 963          * copied out, which is fine because it is userland's own data.
 964          */
 965         return roundup2(size, sizeof(int32_t));
 966 }
 967
 968 /*
 969  * Retrieve node descriptions in bulk, or retrieve or assign a particular
 970  * node's description.
 971  */
 972 static ssize_t
 973 mib_describe(struct mib_call * call, struct mib_node * parent,
 974         struct mib_oldp * oldp, struct mib_newp * newp)
 975 {
 976         struct sysctlnode scn;
 977         struct mib_node *node;
 978         struct mib_dynode *dynode;
 979         size_t off;
 980         int r, id;
 981
 982         /* If new data are given, they identify a particular target node. */
 983         if (newp != NULL) {
 984                 if ((r = mib_copyin(newp, &scn, sizeof(scn))) != OK)
 985                         return r;
 986
 987                 if (SYSCTL_VERS(scn.sysctl_flags) != SYSCTL_VERSION)
 988                         return EINVAL;
 989
 990                 /* Locate the child node. */
 991                 if ((node = mib_find(parent, scn.sysctl_num, NULL)) == NULL)
 992                         return ENOENT;
 993
 994                 /* Descriptions of private nodes are considered private too. */
 995                 if ((node->node_flags & CTLFLAG_PRIVATE) && !mib_authed(call))
 996                         return EPERM;
 997
 998                 /*
 999                  * If a description pointer was given, this is a request to
1000                  * set the node's description.
1001                  */
1002                 if (scn.sysctl_desc != NULL) {
1003                         /* Such a request requires superuser privileges. */
1004                         if (!mib_authed(call))
1005                                 return EPERM;
1006
1007                         /*
1008                          * The node must not be a mount point.  Arguably this
1009                          * check is not necessary, since we use the description
1010                          * of the preexisting underlying node anyway.
1011                          */
1012                         if (SYSCTL_TYPE(node->node_flags) == CTLTYPE_NODE &&
1013                             (node->node_flags & CTLFLAG_REMOTE))
1014                                 return EBUSY;
1015
1016                         /* The node must not already have a description. */
1017                         if (node->node_desc != NULL)
1018                                 return EPERM;
1019
1020                         /* The node must not be marked as permanent. */
1021                         if (node->node_flags & CTLFLAG_PERMANENT)
1022                                 return EPERM;
1023
1024                         /*
1025                          * If the user supplied a version, it must match.
1026                          * NetBSD performs this check only when setting
1027                          * descriptions, and thus, so do we..
1028                          */
1029                         if (scn.sysctl_ver != 0 &&
1030                             scn.sysctl_ver != node->node_ver)
1031                                 return EINVAL;
1032
1033                         /*
1034                          * Copy in the description to the scratch buffer.
1035                          * The length of the description must be reasonable.
1036                          */
1037                         if ((r = mib_copyin_str(newp,
1038                             (vir_bytes)scn.sysctl_desc, scratch, MAXDESCLEN,
1039                             NULL)) != OK)
1040                                 return r;
1041
1042                         /* Allocate memory and store the description. */
1043                         if ((node->node_desc = strdup(scratch)) == NULL) {
1044                                 printf("MIB: out of memory!\n");
1045
1046                                 return EINVAL; /* do not return ENOMEM */
1047                         }
1048                         mib_objects++;
1049
1050                         /* The description must now be freed with the node. */
1051                         node->node_flags |= CTLFLAG_OWNDESC;
1052                 }
1053
1054                 /*
1055                  * Either way, copy out the requested node's description, which
1056                  * should indeed be the new description if one was just set.
1057                  * Note that we have already performed the permission check
1058                  * that could make this call return zero, so here it will not.
1059                  */
1060                 return mib_copyout_desc(call, oldp, 0, scn.sysctl_num, node);
1061         }
1062
1063         /* See also the considerations laid out in mib_query(). */
1064         off = 0;
1065
1066         /* First describe the static nodes. */
1067         for (id = 0; IS_STATIC_ID(parent, id); id++) {
1068                 node = &parent->node_scptr[id];
1069
1070                 if (node->node_flags == 0)
1071                         continue;
1072
1073                 if ((r = mib_copyout_desc(call, oldp, off, id, node)) < 0)
1074                         return r;
1075                 off += r;
1076         }
1077
1078         /* Then describe the dynamic nodes. */
1079         for (dynode = parent->node_dcptr; dynode != NULL;
1080             dynode = dynode->dynode_next) {
1081                 node = &dynode->dynode_node;
1082
1083                 if ((r = mib_copyout_desc(call, oldp, off, dynode->dynode_id,
1084                     node)) < 0)
1085                         return r;
1086                 off += r;
1087         }
1088
1089         return off;
1090 }
1091
1092 /*
1093  * Return a pointer to the data associated with the given node, or NULL if the
1094  * node has no associated data.  Actual calls to this function should never
1095  * result in NULL - as long as the proper rules are followed elsewhere.
1096  */
1097 static void *
1098 mib_getptr(struct mib_node * node)
1099 {
1100
1101         switch (SYSCTL_TYPE(node->node_flags)) {
1102         case CTLTYPE_BOOL:
1103                 if (node->node_flags & CTLFLAG_IMMEDIATE)
1104                         return &node->node_bool;
1105                 break;
1106         case CTLTYPE_INT:
1107                 if (node->node_flags & CTLFLAG_IMMEDIATE)
1108                         return &node->node_int;
1109                 break;
1110         case CTLTYPE_QUAD:
1111                 if (node->node_flags & CTLFLAG_IMMEDIATE)
1112                         return &node->node_quad;
1113                 break;
1114         case CTLTYPE_STRING:
1115         case CTLTYPE_STRUCT:
1116                 if (node->node_flags & CTLFLAG_IMMEDIATE)
1117                         return NULL;
1118                 break;
1119         default:
1120                 return NULL;
1121         }
1122
1123         return node->node_data;
1124 }
1125
1126 /*
1127  * Read current (old) data from a regular data node, if requested.  Return the
1128  * old data length.
1129  */
1130 static ssize_t
1131 mib_read(struct mib_node * node, struct mib_oldp * oldp)
1132 {
1133         void *ptr;
1134         size_t oldlen;
1135         int r;
1136
1137         if ((ptr = mib_getptr(node)) == NULL)
1138                 return EINVAL;
1139
1140         if (SYSCTL_TYPE(node->node_flags) == CTLTYPE_STRING)
1141                 oldlen = strlen(node->node_data) + 1;
1142         else
1143                 oldlen = node->node_size;
1144
1145         if (oldlen > SSIZE_MAX)
1146                 return EINVAL;
1147
1148         /* Copy out the current data, if requested at all. */
1149         if (oldp != NULL && (r = mib_copyout(oldp, 0, ptr, oldlen)) < 0)
1150                 return r;
1151
1152         /* Return the current length in any case. */
1153         return (ssize_t)oldlen;
1154 }
1155
1156 /*
1157  * Write new data into a regular data node, if requested.
1158  */
1159 static int
1160 mib_write(struct mib_call * call, struct mib_node * node,
1161         struct mib_newp * newp, mib_verify_ptr verify)
1162 {
1163         bool b[(sizeof(bool) == sizeof(char)) ? 1 : -1]; /* explained below */
1164         char *src, *dst;
1165         size_t newlen;
1166         int r;
1167
1168         if (newp == NULL)
1169                 return OK; /* nothing to do */
1170
1171         /*
1172          * When setting a new value, we cannot risk doing an in-place update:
1173          * the copy from userland may fail halfway through, in which case an
1174          * in-place update could leave the node value in a corrupted state.
1175          * Thus, we must first fetch any new data into a temporary buffer.
1176          *
1177          * Given that we use intermediate data storage, we could support value
1178          * swapping, where the user provides the same buffer for new and old
1179          * data.  We choose not to: NetBSD does not support it, it would make
1180          * trace(1)'s job a lot harder, and it would convolute the code here.
1181          */
1182         newlen = mib_getnewlen(newp);
1183
1184         if ((dst = mib_getptr(node)) == NULL)
1185                 return EINVAL;
1186
1187         switch (SYSCTL_TYPE(node->node_flags)) {
1188         case CTLTYPE_BOOL:
1189         case CTLTYPE_INT:
1190         case CTLTYPE_QUAD:
1191         case CTLTYPE_STRUCT:
1192                 /* Non-string types must have an exact size match. */
1193                 if (newlen != node->node_size)
1194                         return EINVAL;
1195                 break;
1196         case CTLTYPE_STRING:
1197                 /*
1198                  * Strings must not exceed their buffer size.  There is a
1199                  * second check further below, because we allow userland to
1200                  * give us an unterminated string.  In that case we terminate
1201                  * it ourselves, but then the null terminator must fit as well.
1202                  */
1203                 if (newlen > node->node_size)
1204                         return EINVAL;
1205                 break;
1206         default:
1207                 return EINVAL;
1208         }
1209
1210         /*
1211          * If we cannot fit the data in the scratch buffer, then allocate a
1212          * temporary buffer.  We add one extra byte so that we can add a null
1213          * terminator at the end of strings in case userland did not supply
1214          * one.  Either way, we must free the temporary buffer later!
1215          *
1216          * The alternative is to ensure that the given memory is accessible
1217          * before starting the copy, but that would break if we ever add kernel
1218          * threads or anything that allows asynchronous memory unmapping, etc.
1219          */
1220         if (newlen + 1 > sizeof(scratch)) {
1221                 /*
1222                  * In practice, the temporary buffer is at least an entire
1223                  * memory page, which is reasonable by any standard.  As a
1224                  * result, we can get away with refusing to perform dynamic
1225                  * allocation for unprivileged users.  This limits the impact
1226                  * that unprivileged users can have on our memory space.
1227                  */
1228                 if (!mib_authed(call))
1229                         return EPERM;
1230
1231                 /*
1232                  * Do not return ENOMEM on allocation failure, because ENOMEM
1233                  * implies that a valid old length was returned.
1234                  */
1235                 if ((src = malloc(newlen + 1)) == NULL) {
1236                         printf("MIB: out of memory!\n");
1237
1238                         return EINVAL;
1239                 }
1240                 mib_objects++;
1241         } else
1242                 src = scratch;
1243
1244         /* Copy in the data.  Note that newlen may be zero. */
1245         r = mib_copyin(newp, src, newlen);
1246
1247         if (r == OK && verify != NULL && !verify(call, node, src, newlen))
1248                 r = EINVAL;
1249
1250         if (r == OK) {
1251                 /* Check and, if acceptable, store the new value. */
1252                 switch (SYSCTL_TYPE(node->node_flags)) {
1253                 case CTLTYPE_BOOL:
1254                         /*
1255                          * Due to the nature of the C99 _Bool type, we can not
1256                          * test directly whether the given boolean value is a
1257                          * value that is not "true" and not "false".  In the
1258                          * worst case, another value could invoke undefined
1259                          * behavior.  We try our best to sanitize the value
1260                          * without looking at it directly, which unfortunately
1261                          * requires us to test for the size of the bool type.
1262                          * We do that at compile time, hence the 'b' "array".
1263                          * Any size other than one byte is an ABI violation.
1264                          */
1265                         b[0] = (bool)src[0];
1266                         memcpy(dst, &b[0], sizeof(b[0]));
1267                         break;
1268                 case CTLTYPE_INT:
1269                 case CTLTYPE_QUAD:
1270                 case CTLTYPE_STRUCT:
1271                         memcpy(dst, src, node->node_size);
1272                         break;
1273                 case CTLTYPE_STRING:
1274                         if (newlen == node->node_size &&
1275                             src[newlen - 1] != '\0') {
1276                                 /* Our null terminator does not fit! */
1277                                 r = EINVAL;
1278                                 break;
1279                         }
1280                         /*
1281                          * We do not mind null characters in the middle.  In
1282                          * general, the buffer may contain garbage after the
1283                          * first null terminator, but such garbage will never
1284                          * end up being copied out.
1285                          */
1286                         src[newlen] = '\0';
1287                         strlcpy(dst, src, node->node_size);
1288                         break;
1289                 default:
1290                         r = EINVAL;
1291                 }
1292         }
1293
1294         if (src != scratch) {
1295                 free(src);
1296                 mib_objects--;
1297         }
1298
1299         return r;
1300 }
1301
1302 /*
1303  * Read and/or write the value of a regular data node.  A regular data node is
1304  * a leaf node.  Typically, a leaf node has no associated function, in which
1305  * case this function will be used instead.  In addition, this function may be
1306  * used from handler functions as part of their functionality.
1307  */
1308 ssize_t
1309 mib_readwrite(struct mib_call * call, struct mib_node * node,
1310         struct mib_oldp * oldp, struct mib_newp * newp, mib_verify_ptr verify)
1311 {
1312         ssize_t len;
1313         int r;
1314
1315         /* Copy out old data, if requested.  Always get the old data length. */
1316         if ((r = len = mib_read(node, oldp)) < 0)
1317                 return r;
1318
1319         /* Copy in new data, if requested. */
1320         if ((r = mib_write(call, node, newp, verify)) != OK)
1321                 return r;
1322
1323         /* Return the old data length. */
1324         return len;
1325 }
1326
1327 /*
1328  * Dispatch a sysctl call, by looking up the target node by its MIB name and
1329  * taking the appropriate action on the resulting node, if found.  Return the
1330  * old data length on success, or a negative error code on failure.
1331  */
1332 ssize_t
1333 mib_dispatch(struct mib_call * call, struct mib_oldp * oldp,
1334         struct mib_newp * newp)
1335 {
1336         struct mib_node *parent, *node;
1337         ssize_t r;
1338         int id, is_leaf, can_restart, has_verify, has_func;
1339
1340         assert(call->call_namelen <= CTL_MAXNAME);
1341
1342         /*
1343          * Resolve the name by descending into the node tree, level by level,
1344          * starting at the MIB root.
1345          */
1346         for (parent = &mib_root; call->call_namelen > 0; parent = node) {
1347                 id = call->call_name[0];
1348                 call->call_name++;
1349                 call->call_namelen--;
1350
1351                 assert(SYSCTL_TYPE(parent->node_flags) == CTLTYPE_NODE);
1352                 assert(parent->node_flags & CTLFLAG_PARENT);
1353
1354                 /*
1355                  * Check for meta-identifiers.  Regular identifiers are never
1356                  * negative, although node handler functions may take subpaths
1357                  * with negative identifiers that are not meta-identifiers
1358                  * (e.g., see KERN_PROC2).
1359                  */
1360                 if (id < 0) {
1361                         /*
1362                          * A meta-identifier must always be the last name
1363                          * component.
1364                          */
1365                         if (call->call_namelen > 0)
1366                                 return EINVAL;
1367
1368                         switch (id) {
1369                         case CTL_QUERY:
1370                                 return mib_query(call, parent, oldp, newp);
1371                         case CTL_CREATE:
1372                                 return mib_create(call, parent, oldp, newp);
1373                         case CTL_DESTROY:
1374                                 return mib_destroy(call, parent, oldp, newp);
1375                         case CTL_DESCRIBE:
1376                                 return mib_describe(call, parent, oldp, newp);
1377                         case CTL_CREATESYM:
1378                         case CTL_MMAP:
1379                         default:
1380                                 return EOPNOTSUPP;
1381                         }
1382                 }
1383
1384                 /* Locate the child node. */
1385                 if ((node = mib_find(parent, id, NULL /*prevp*/)) == NULL)
1386                         return ENOENT;
1387
1388                 /* Check if access is permitted at this level. */
1389                 if ((node->node_flags & CTLFLAG_PRIVATE) && !mib_authed(call))
1390                         return EPERM;
1391
1392                 /*
1393                  * Start by checking if the node is a remote node.  If so, let
1394                  * a remote service handle the remainder of this request.
1395                  * However, as part of attempting the remote call, we may
1396                  * discover that the remote service has died or that it is
1397                  * unmounting the subtree.  If the node was not a temporary
1398                  * mountpoint, we should (and do) continue with the request
1399                  * locally - if it was, it will already be deallocated and we
1400                  * must be very careful not to access 'node' again!
1401                  */
1402                 is_leaf = (SYSCTL_TYPE(node->node_flags) != CTLTYPE_NODE);
1403
1404                 if (!is_leaf && (node->node_flags & CTLFLAG_REMOTE)) {
1405                         /* Determine this before 'node' may disappear.. */
1406                         can_restart = (node->node_flags & CTLFLAG_PARENT);
1407
1408                         r = mib_remote_call(call, node, oldp, newp);
1409
1410                         if (r != ERESTART || !can_restart)
1411                                 return (r != ERESTART) ? r : ENOENT;
1412
1413                         /* Service died, subtree is unmounted, keep going. */
1414                         assert(SYSCTL_TYPE(node->node_flags) == CTLTYPE_NODE);
1415                         assert(!(node->node_flags & CTLFLAG_REMOTE));
1416                 }
1417
1418                 /*
1419                  * Is this a leaf node, and/or is this node handled by a
1420                  * function?  If either is true, resolution ends at this level.
1421                  * In order to save a few bytes of memory per node, we use
1422                  * different ways to determine whether there is a function
1423                  * depending on whether the node is a leaf or not.
1424                  */
1425                 if (is_leaf) {
1426                         has_verify = (node->node_flags & CTLFLAG_VERIFY);
1427                         has_func = (!has_verify && node->node_func != NULL);
1428                 } else {
1429                         has_verify = FALSE;
1430                         has_func = !(node->node_flags & CTLFLAG_PARENT);
1431                 }
1432
1433                 /*
1434                  * The name may be longer only if the node is not a leaf.  That
1435                  * also applies to leaves with functions, so check this first.
1436                  */
1437                 if (is_leaf && call->call_namelen > 0)
1438                         return ENOTDIR;
1439
1440                 /*
1441                  * If resolution indeed ends here, and the user supplied new
1442                  * data, check if writing is allowed.  For functions, it is
1443                  * arguable whether we should do this check here already.
1444                  * However, for now, this approach covers all our use cases.
1445                  */
1446                 if ((is_leaf || has_func) && newp != NULL) {
1447                         if (!(node->node_flags & CTLFLAG_READWRITE))
1448                                 return EPERM;
1449
1450                         /*
1451                          * Unless nonprivileged users may write to this node,
1452                          * ensure that the user has superuser privileges.  The
1453                          * ANYWRITE flag does not override the READWRITE flag.
1454                          */
1455                         if (!(node->node_flags & CTLFLAG_ANYWRITE) &&
1456                             !mib_authed(call))
1457                                 return EPERM;
1458                 }
1459
1460                 /* If this node has a handler function, let it do the work. */
1461                 if (has_func)
1462                         return node->node_func(call, node, oldp, newp);
1463
1464                 /* For regular data leaf nodes, handle generic access. */
1465                 if (is_leaf)
1466                         return mib_readwrite(call, node, oldp, newp,
1467                             has_verify ? node->node_verify : NULL);
1468
1469                 /* No function and not a leaf?  Descend further. */
1470         }
1471
1472         /* If we get here, the name refers to a node array. */
1473         return EISDIR;
1474 }
1475
1476 /*
1477  * Recursively initialize the static tree at initialization time.
1478  */
1479 static void
1480 mib_tree_recurse(struct mib_node * parent)
1481 {
1482         struct mib_node *node;
1483         int id;
1484
1485         assert(SYSCTL_TYPE(parent->node_flags) == CTLTYPE_NODE);
1486         assert(parent->node_flags & CTLFLAG_PARENT);
1487
1488         /*
1489          * Later on, node_csize and node_clen will also include dynamically
1490          * created nodes.  This means that we cannot use node_csize to iterate
1491          * over the static nodes.
1492          */
1493         parent->node_csize = parent->node_size;
1494
1495         node = parent->node_scptr;
1496
1497         for (id = 0; IS_STATIC_ID(parent, id); id++, node++) {
1498                 if (node->node_flags == 0)
1499                         continue;
1500
1501                 mib_nodes++;
1502
1503                 parent->node_clen++;
1504
1505                 node->node_ver = parent->node_ver;
1506                 node->node_parent = parent;
1507
1508                 /* Recursively apply this function to all node children. */
1509                 if (SYSCTL_TYPE(node->node_flags) == CTLTYPE_NODE &&
1510                     (node->node_flags & CTLFLAG_PARENT))
1511                         mib_tree_recurse(node);
1512         }
1513 }
1514
1515 /*
1516  * Go through the entire static tree, recursively, initializing some values
1517  * that could not be assigned at compile time.
1518  */
1519 void
1520 mib_tree_init(void)
1521 {
1522
1523         /* Initialize some variables. */
1524         mib_nodes = 1; /* the root node itself */
1525         mib_objects = 0;
1526
1527         /*
1528          * The entire tree starts with the same, nonzero node version.
1529          * The root node is the only node without a parent.
1530          */
1531         mib_root.node_ver = 1;
1532         mib_root.node_parent = NULL;
1533
1534         /* Recursively initialize the static tree. */
1535         mib_tree_recurse(&mib_root);
1536 }
1537
1538 /*
1539  * Process a subtree mount request from a remote service.  Return OK on
1540  * success, with a pointer to the resulting static-node structure stored in
1541  * 'nodep'.  Return a negative error code on failure.
1542  */
1543 int
1544 mib_mount(const int * mib, unsigned int miblen, unsigned int eid, uint32_t rid,
1545         uint32_t flags, unsigned int csize, unsigned int clen,
1546         struct mib_node ** nodep)
1547 {
1548         struct mib_dynode *dynode, **prevp;
1549         struct mib_node *parent, *node;
1550         char name[SYSCTL_NAMELEN], *desc;
1551         size_t size, namelen, desclen;
1552         unsigned int n;
1553         int r, id;
1554
1555         /*
1556          * Perform initial verification of the given parameters.  Even stricter
1557          * checks may be performed later.
1558          */
1559         /*
1560          * By policy, we forbid mounting top-level nodes.  This is in effect
1561          * also the only security-like restriction: a service should not be
1562          * able to just take over, say, the entire "kern" subtree.  There is
1563          * currently little in the way of a service taking over an important
1564          * set of second-level nodes, though.
1565          *
1566          * TODO: allow mounting of predefined mount points only, for example by
1567          * having an internal node flag that permits mounting the subtree or
1568          * any node in it.  As an even better alternative, allow this to be
1569          * controlled through a policy specification; unfortunately, this would
1570          * also add a substantial amount of infrastructure.
1571          */
1572         if (miblen < 2) {
1573                 MIB_DEBUG_MOUNT(("MIB: mounting failed, path too short\n"));
1574
1575                 return EPERM;
1576         }
1577
1578         /*
1579          * The flags field is highly restricted right now.  Only a few flags
1580          * may be given at all, and then when using an existing node as mount
1581          * point, the flag must exactly match the existing node's flags.
1582          */
1583         if (SYSCTL_VERS(flags) != SYSCTL_VERSION ||
1584             SYSCTL_TYPE(flags) != CTLTYPE_NODE ||
1585             (SYSCTL_FLAGS(flags) & ~(CTLFLAG_READONLY | CTLFLAG_READWRITE |
1586             CTLFLAG_PERMANENT | CTLFLAG_HIDDEN)) != 0) {
1587                 MIB_DEBUG_MOUNT(("MIB: mounting failed, invalid flags %"PRIx32
1588                     "\n", flags));
1589
1590                 return EINVAL;
1591         }
1592
1593         if (csize > (1U << MIB_RC_BITS) || clen > csize) {
1594                 MIB_DEBUG_MOUNT(("MIB: mounting failed, invalid child size or "
1595                     "length (%u, %u)\n", csize, clen));
1596
1597                 return EINVAL;
1598         }
1599
1600         /*
1601          * Look up the parent node of the mount point.  This parent node must
1602          * exist - we don't want to create more than one temporary node in any
1603          * case.  All the nodes leading up to and including the parent node
1604          * must be real, local, non-private, node-type nodes.  The path may not
1605          * be private, because that would allow an unprivileged service to
1606          * intercept writes to privileged nodes--currently a total nonissue in
1607          * practice, but still.  Note that the service may itself restrict
1608          * access to nodes in its own mounted subtree in any way it wishes.
1609          */
1610         parent = &mib_root;
1611
1612         for (n = 0; n < miblen - 1; n++) {
1613                 /* Meta-identifiers are obviously not allowed in the path. */
1614                 if ((id = mib[n]) < 0) {
1615                         MIB_DEBUG_MOUNT(("MIB: mounting failed, meta-ID in "
1616                             "path\n"));
1617
1618                         return EINVAL;
1619                 }
1620
1621                 /* Locate the child node. */
1622                 if ((node = mib_find(parent, id, NULL /*prevp*/)) == NULL) {
1623                         MIB_DEBUG_MOUNT(("MIB: mounting failed, path not "
1624                             "found\n"));
1625
1626                         return ENOENT;
1627                 }
1628
1629                 /* Make sure it is a regular node-type node. */
1630                 if (SYSCTL_TYPE(node->node_flags) != CTLTYPE_NODE ||
1631                     !(node->node_flags & CTLFLAG_PARENT) ||
1632                     (node->node_flags & (CTLFLAG_REMOTE | CTLFLAG_PRIVATE))) {
1633                         MIB_DEBUG_MOUNT(("MIB: mounting failed, unacceptable "
1634                             "node on path\n"));
1635
1636                         return EPERM;
1637                 }
1638
1639                 parent = node;
1640         }
1641
1642         /* Now see if the mount point itself exists. */
1643         if ((id = mib[miblen - 1]) < 0) {
1644                 MIB_DEBUG_MOUNT(("MIB: mounting failed, meta-ID in path\n"));
1645
1646                 return EINVAL;
1647         }
1648
1649         /*
1650          * If the target node exists and passes all tests, it will simply be
1651          * converted to a mount point.  If the target node does not exist, we
1652          * have to allocate a temporary node as mount point.
1653          */
1654         if ((node = mib_find(parent, id, NULL /*prevp*/)) != NULL) {
1655                 /*
1656                  * We are about to mount on an existing node.  As stated above,
1657                  * the node flags must match the given flags exactly.
1658                  */
1659                 if (SYSCTL_TYPE(node->node_flags) != CTLTYPE_NODE ||
1660                     SYSCTL_FLAGS(node->node_flags) !=
1661                     (SYSCTL_FLAGS(flags) | CTLFLAG_PARENT)) {
1662                         MIB_DEBUG_MOUNT(("MIB: mounting failed, target node "
1663                             "mismatch (%"PRIx32", %"PRIx32")\n",
1664                             node->node_flags, flags));
1665
1666                         return EPERM;
1667                 }
1668
1669                 /*
1670                  * If the node has dynamically added children, we will not be
1671                  * able to restore the node to its old state when unmounting.
1672                  */
1673                 if (node->node_size != node->node_csize) {
1674                         MIB_DEBUG_MOUNT(("MIB: mounting failed, node has "
1675                             "dynamic children\n"));
1676
1677                         return EBUSY;
1678                 }
1679
1680                 mib_upgrade(node);
1681         } else {
1682                 /*
1683                  * We are going to create a temporary mount point.  Much of the
1684                  * procedure that follows is a rather selective extract from
1685                  * mib_create().  Start with a check for the impossible.
1686                  */
1687                 if (parent->node_csize == INT_MAX) {
1688                         MIB_DEBUG_MOUNT(("MIB: mounting failed, parent node "
1689                             "full\n"));
1690
1691                         return EINVAL;
1692                 }
1693
1694                 /*
1695                  * In order to create the new node, we also need the node's
1696                  * name and description; those did not fit in the request
1697                  * message.  Ask the caller to copy these strings to us.
1698                  */
1699                 name[0] = '\0';
1700                 scratch[0] = '\0';
1701
1702                 if ((r = mib_remote_info(eid, rid, name, sizeof(name), scratch,
1703                     MAXDESCLEN)) != OK) {
1704                         MIB_DEBUG_MOUNT(("MIB: mounting failed, node info "
1705                             "request yielded %d\n", r));
1706
1707                         return r;
1708                 }
1709
1710                 /* Make sure the name is valid. */
1711                 if ((namelen = mib_check_name(name, sizeof(name))) == 0) {
1712                         printf("MIB: mounting failed, bad name\n");
1713
1714                         return EINVAL;
1715                 }
1716
1717                 /* Just forcefully terminate the description. */
1718                 scratch[MAXDESCLEN - 1] = '\0';
1719                 desclen = strlen(scratch);
1720
1721                 /*
1722                  * We know the identifier is not in use yet; make sure that the
1723                  * name is not, either.  As a side effect, find out where the
1724                  * new node should be inserted upon success.
1725                  */
1726                 if (mib_scan(parent, id, name, &id /*unused*/, &prevp,
1727                     &node /*unused*/) != OK) {
1728                         MIB_DEBUG_MOUNT(("MIB: mounting failed, name "
1729                             "conflict\n"));
1730
1731                         return EEXIST;
1732                 }
1733
1734                 /*
1735                  * Allocate a dynamic node.  Unlike for user-created dynamic
1736                  * nodes, temporary mount points also include the description
1737                  * in the dynode object.
1738                  */
1739                 size = sizeof(*dynode) + namelen + desclen + 1;
1740
1741                 if ((dynode = malloc(size)) == NULL) {
1742                         printf("MIB: out of memory!\n");
1743
1744                         return ENOMEM;
1745                 }
1746                 mib_objects++;
1747
1748                 /* Initialize the dynamic node. */
1749                 memset(dynode, 0, sizeof(*dynode));
1750                 dynode->dynode_id = id;
1751                 strlcpy(dynode->dynode_name, name, namelen + 1);
1752                 desc = &dynode->dynode_name[namelen + 1];
1753                 strlcpy(desc, scratch, desclen + 1);
1754
1755                 node = &dynode->dynode_node;
1756                 node->node_flags = flags & ~SYSCTL_VERS_MASK;
1757                 node->node_size = 0;
1758                 node->node_parent = parent;
1759                 node->node_name = dynode->dynode_name;
1760                 node->node_desc = desc;
1761
1762                 /*
1763                  * Add the new dynamic node into the tree, and adjust versions
1764                  * and counters.
1765                  */
1766                 mib_add(dynode, prevp);
1767         }
1768
1769         /* Success!  Perform the actual mount, and return the target node. */
1770         node->node_flags |= CTLFLAG_REMOTE;
1771         node->node_eid = eid;
1772         node->node_rcsize = csize;
1773         node->node_rclen = clen;
1774         node->node_rid = rid;
1775
1776         mib_remotes++;
1777
1778         *nodep = node;
1779         return OK;
1780 }
1781
1782 /*
1783  * Unmount the remote subtree identified by the given node.  Release the mount
1784  * point by reversing the action performed while mounting.  Also bump the
1785  * version numbers on the path, so that userland knows that it is to expect a
1786  * change of contents in the subtree.  This function always succeeds, and may
1787  * deallocate the given node.
1788  */
1789 void
1790 mib_unmount(struct mib_node * node)
1791 {
1792         struct mib_dynode **prevp;
1793         struct mib_node *child;
1794         int id;
1795
1796         assert(SYSCTL_TYPE(node->node_flags) == CTLTYPE_NODE);
1797         assert(node->node_flags & CTLFLAG_REMOTE);
1798
1799         /*
1800          * Given that the node has the CTLFLAG_REMOTE flag set, we can now tell
1801          * whether the remote subtree obscured a preexisting node or we created
1802          * a temporary mount point, by checking its CTLFLAG_PARENT flag.
1803          */
1804         if (node->node_flags & CTLFLAG_PARENT) {
1805                 /*
1806                  * Return the node to its former pre-mount state.  Restore the
1807                  * original node_clen field by recomputing it.
1808                  */
1809                 node->node_flags &= ~CTLFLAG_REMOTE;
1810                 node->node_csize = node->node_size;
1811                 node->node_clen = 0;
1812
1813                 for (id = 0; IS_STATIC_ID(node, id); id++) {
1814                         child = &node->node_scptr[id];
1815
1816                         if (child->node_flags != 0)
1817                                 node->node_clen++;
1818                 }
1819
1820                 node->node_dcptr = NULL;
1821
1822                 /* Increase version numbers on the path to the node. */
1823                 mib_upgrade(node);
1824         } else {
1825                 /*
1826                  * We know that we dynamically allocated this node; find its
1827                  * parent's pointer to it.
1828                  */
1829                 for (prevp = &node->node_parent->node_dcptr; *prevp != NULL;
1830                     prevp = &(*prevp)->dynode_next) {
1831                         if (&(*prevp)->dynode_node == node)
1832                                 break;
1833                 }
1834                 assert(*prevp != NULL);
1835
1836                 /* Free the node, and adjust counts and versions. */
1837                 mib_remove(node, prevp);
1838         }
1839
1840         assert(mib_remotes > 0);
1841         mib_remotes--;
1842 }