sys/miscfs/procfs/procfs_vnops.c

   1 /*      $NetBSD: procfs_vnops.c,v 1.176 2009/07/03 21:17:42 elad Exp $  */
   2
   3 /*-
   4  * Copyright (c) 2006, 2007, 2008 The NetBSD Foundation, Inc.
   5  * All rights reserved.
   6  *
   7  * This code is derived from software contributed to The NetBSD Foundation
   8  * by Andrew Doran.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  *
  19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
  20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  29  * POSSIBILITY OF SUCH DAMAGE.
  30  */
  31
  32 /*
  33  * Copyright (c) 1993, 1995
  34  *      The Regents of the University of California.  All rights reserved.
  35  *
  36  * This code is derived from software contributed to Berkeley by
  37  * Jan-Simon Pendry.
  38  *
  39  * Redistribution and use in source and binary forms, with or without
  40  * modification, are permitted provided that the following conditions
  41  * are met:
  42  * 1. Redistributions of source code must retain the above copyright
  43  *    notice, this list of conditions and the following disclaimer.
  44  * 2. Redistributions in binary form must reproduce the above copyright
  45  *    notice, this list of conditions and the following disclaimer in the
  46  *    documentation and/or other materials provided with the distribution.
  47  * 3. Neither the name of the University nor the names of its contributors
  48  *    may be used to endorse or promote products derived from this software
  49  *    without specific prior written permission.
  50  *
  51  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  52  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  53  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  54  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  55  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  56  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  57  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  58  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  59  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  60  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  61  * SUCH DAMAGE.
  62  *
  63  *      @(#)procfs_vnops.c      8.18 (Berkeley) 5/21/95
  64  */
  65
  66 /*
  67  * Copyright (c) 1993 Jan-Simon Pendry
  68  *
  69  * This code is derived from software contributed to Berkeley by
  70  * Jan-Simon Pendry.
  71  *
  72  * Redistribution and use in source and binary forms, with or without
  73  * modification, are permitted provided that the following conditions
  74  * are met:
  75  * 1. Redistributions of source code must retain the above copyright
  76  *    notice, this list of conditions and the following disclaimer.
  77  * 2. Redistributions in binary form must reproduce the above copyright
  78  *    notice, this list of conditions and the following disclaimer in the
  79  *    documentation and/or other materials provided with the distribution.
  80  * 3. All advertising materials mentioning features or use of this software
  81  *    must display the following acknowledgement:
  82  *      This product includes software developed by the University of
  83  *      California, Berkeley and its contributors.
  84  * 4. Neither the name of the University nor the names of its contributors
  85  *    may be used to endorse or promote products derived from this software
  86  *    without specific prior written permission.
  87  *
  88  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  89  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  90  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  91  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  92  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  93  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  94  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  95  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  96  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  97  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  98  * SUCH DAMAGE.
  99  *
 100  *      @(#)procfs_vnops.c      8.18 (Berkeley) 5/21/95
 101  */
 102
 103 /*
 104  * procfs vnode interface
 105  */
 106
 107 #include <sys/cdefs.h>
 108 __KERNEL_RCSID(0, "$NetBSD: procfs_vnops.c,v 1.176 2009/07/03 21:17:42 elad Exp $");
 109
 110 #include <sys/param.h>
 111 #include <sys/systm.h>
 112 #include <sys/time.h>
 113 #include <sys/kernel.h>
 114 #include <sys/file.h>
 115 #include <sys/filedesc.h>
 116 #include <sys/proc.h>
 117 #include <sys/vnode.h>
 118 #include <sys/namei.h>
 119 #include <sys/malloc.h>
 120 #include <sys/mount.h>
 121 #include <sys/dirent.h>
 122 #include <sys/resourcevar.h>
 123 #include <sys/stat.h>
 124 #include <sys/ptrace.h>
 125 #include <sys/kauth.h>
 126
 127 #include <uvm/uvm_extern.h>     /* for PAGE_SIZE */
 128
 129 #include <machine/reg.h>
 130
 131 #include <miscfs/genfs/genfs.h>
 132 #include <miscfs/procfs/procfs.h>
 133
 134 /*
 135  * Vnode Operations.
 136  *
 137  */
 138
 139 static int procfs_validfile_linux(struct lwp *, struct mount *);
 140 static int procfs_root_readdir_callback(struct proc *, void *);
 141 static void procfs_dir(pfstype, struct lwp *, struct proc *, char **, char *,
 142     size_t);
 143
 144 /*
 145  * This is a list of the valid names in the
 146  * process-specific sub-directories.  It is
 147  * used in procfs_lookup and procfs_readdir
 148  */
 149 static const struct proc_target {
 150         u_char  pt_type;
 151         u_char  pt_namlen;
 152         const char      *pt_name;
 153         pfstype pt_pfstype;
 154         int     (*pt_valid)(struct lwp *, struct mount *);
 155 } proc_targets[] = {
 156 #define N(s) sizeof(s)-1, s
 157         /*        name          type            validp */
 158         { DT_DIR, N("."),       PFSproc,        NULL },
 159         { DT_DIR, N(".."),      PFSroot,        NULL },
 160         { DT_DIR, N("fd"),      PFSfd,          NULL },
 161         { DT_REG, N("file"),    PFSfile,        procfs_validfile },
 162         { DT_REG, N("mem"),     PFSmem,         NULL },
 163         { DT_REG, N("regs"),    PFSregs,        procfs_validregs },
 164         { DT_REG, N("fpregs"),  PFSfpregs,      procfs_validfpregs },
 165         { DT_REG, N("ctl"),     PFSctl,         NULL },
 166         { DT_REG, N("stat"),    PFSstat,        procfs_validfile_linux },
 167         { DT_REG, N("status"),  PFSstatus,      NULL },
 168         { DT_REG, N("note"),    PFSnote,        NULL },
 169         { DT_REG, N("notepg"),  PFSnotepg,      NULL },
 170         { DT_REG, N("map"),     PFSmap,         procfs_validmap },
 171         { DT_REG, N("maps"),    PFSmaps,        procfs_validmap },
 172         { DT_REG, N("cmdline"), PFScmdline,     NULL },
 173         { DT_REG, N("exe"),     PFSexe,         procfs_validfile },
 174         { DT_LNK, N("cwd"),     PFScwd,         NULL },
 175         { DT_LNK, N("root"),    PFSchroot,      NULL },
 176         { DT_LNK, N("emul"),    PFSemul,        NULL },
 177         { DT_REG, N("statm"),   PFSstatm,       procfs_validfile_linux },
 178 #ifdef __HAVE_PROCFS_MACHDEP
 179         PROCFS_MACHDEP_NODETYPE_DEFNS
 180 #endif
 181 #undef N
 182 };
 183 static const int nproc_targets = sizeof(proc_targets) / sizeof(proc_targets[0]);
 184
 185 /*
 186  * List of files in the root directory. Note: the validate function will
 187  * be called with p == NULL for these ones.
 188  */
 189 static const struct proc_target proc_root_targets[] = {
 190 #define N(s) sizeof(s)-1, s
 191         /*        name              type            validp */
 192         { DT_REG, N("meminfo"),     PFSmeminfo,        procfs_validfile_linux },
 193         { DT_REG, N("cpuinfo"),     PFScpuinfo,        procfs_validfile_linux },
 194         { DT_REG, N("uptime"),      PFSuptime,         procfs_validfile_linux },
 195         { DT_REG, N("mounts"),      PFSmounts,         procfs_validfile_linux },
 196         { DT_REG, N("devices"),     PFSdevices,        procfs_validfile_linux },
 197         { DT_REG, N("stat"),        PFScpustat,        procfs_validfile_linux },
 198         { DT_REG, N("loadavg"),     PFSloadavg,        procfs_validfile_linux },
 199 #undef N
 200 };
 201 static const int nproc_root_targets =
 202     sizeof(proc_root_targets) / sizeof(proc_root_targets[0]);
 203
 204 int     procfs_lookup(void *);
 205 #define procfs_create   genfs_eopnotsupp
 206 #define procfs_mknod    genfs_eopnotsupp
 207 int     procfs_open(void *);
 208 int     procfs_close(void *);
 209 int     procfs_access(void *);
 210 int     procfs_getattr(void *);
 211 int     procfs_setattr(void *);
 212 #define procfs_read     procfs_rw
 213 #define procfs_write    procfs_rw
 214 #define procfs_fcntl    genfs_fcntl
 215 #define procfs_ioctl    genfs_enoioctl
 216 #define procfs_poll     genfs_poll
 217 #define procfs_revoke   genfs_revoke
 218 #define procfs_fsync    genfs_nullop
 219 #define procfs_seek     genfs_nullop
 220 #define procfs_remove   genfs_eopnotsupp
 221 int     procfs_link(void *);
 222 #define procfs_rename   genfs_eopnotsupp
 223 #define procfs_mkdir    genfs_eopnotsupp
 224 #define procfs_rmdir    genfs_eopnotsupp
 225 int     procfs_symlink(void *);
 226 int     procfs_readdir(void *);
 227 int     procfs_readlink(void *);
 228 #define procfs_abortop  genfs_abortop
 229 int     procfs_inactive(void *);
 230 int     procfs_reclaim(void *);
 231 #define procfs_lock     genfs_lock
 232 #define procfs_unlock   genfs_unlock
 233 #define procfs_bmap     genfs_badop
 234 #define procfs_strategy genfs_badop
 235 int     procfs_print(void *);
 236 int     procfs_pathconf(void *);
 237 #define procfs_islocked genfs_islocked
 238 #define procfs_advlock  genfs_einval
 239 #define procfs_bwrite   genfs_eopnotsupp
 240 #define procfs_putpages genfs_null_putpages
 241
 242 static int atoi(const char *, size_t);
 243
 244 /*
 245  * procfs vnode operations.
 246  */
 247 int (**procfs_vnodeop_p)(void *);
 248 const struct vnodeopv_entry_desc procfs_vnodeop_entries[] = {
 249         { &vop_default_desc, vn_default_error },
 250         { &vop_lookup_desc, procfs_lookup },            /* lookup */
 251         { &vop_create_desc, procfs_create },            /* create */
 252         { &vop_mknod_desc, procfs_mknod },              /* mknod */
 253         { &vop_open_desc, procfs_open },                /* open */
 254         { &vop_close_desc, procfs_close },              /* close */
 255         { &vop_access_desc, procfs_access },            /* access */
 256         { &vop_getattr_desc, procfs_getattr },          /* getattr */
 257         { &vop_setattr_desc, procfs_setattr },          /* setattr */
 258         { &vop_read_desc, procfs_read },                /* read */
 259         { &vop_write_desc, procfs_write },              /* write */
 260         { &vop_fcntl_desc, procfs_fcntl },              /* fcntl */
 261         { &vop_ioctl_desc, procfs_ioctl },              /* ioctl */
 262         { &vop_poll_desc, procfs_poll },                /* poll */
 263         { &vop_revoke_desc, procfs_revoke },            /* revoke */
 264         { &vop_fsync_desc, procfs_fsync },              /* fsync */
 265         { &vop_seek_desc, procfs_seek },                /* seek */
 266         { &vop_remove_desc, procfs_remove },            /* remove */
 267         { &vop_link_desc, procfs_link },                /* link */
 268         { &vop_rename_desc, procfs_rename },            /* rename */
 269         { &vop_mkdir_desc, procfs_mkdir },              /* mkdir */
 270         { &vop_rmdir_desc, procfs_rmdir },              /* rmdir */
 271         { &vop_symlink_desc, procfs_symlink },          /* symlink */
 272         { &vop_readdir_desc, procfs_readdir },          /* readdir */
 273         { &vop_readlink_desc, procfs_readlink },        /* readlink */
 274         { &vop_abortop_desc, procfs_abortop },          /* abortop */
 275         { &vop_inactive_desc, procfs_inactive },        /* inactive */
 276         { &vop_reclaim_desc, procfs_reclaim },          /* reclaim */
 277         { &vop_lock_desc, procfs_lock },                /* lock */
 278         { &vop_unlock_desc, procfs_unlock },            /* unlock */
 279         { &vop_bmap_desc, procfs_bmap },                /* bmap */
 280         { &vop_strategy_desc, procfs_strategy },        /* strategy */
 281         { &vop_print_desc, procfs_print },              /* print */
 282         { &vop_islocked_desc, procfs_islocked },        /* islocked */
 283         { &vop_pathconf_desc, procfs_pathconf },        /* pathconf */
 284         { &vop_advlock_desc, procfs_advlock },          /* advlock */
 285         { &vop_putpages_desc, procfs_putpages },        /* putpages */
 286         { NULL, NULL }
 287 };
 288 const struct vnodeopv_desc procfs_vnodeop_opv_desc =
 289         { &procfs_vnodeop_p, procfs_vnodeop_entries };
 290 /*
 291  * set things up for doing i/o on
 292  * the pfsnode (vp).  (vp) is locked
 293  * on entry, and should be left locked
 294  * on exit.
 295  *
 296  * for procfs we don't need to do anything
 297  * in particular for i/o.  all that is done
 298  * is to support exclusive open on process
 299  * memory images.
 300  */
 301 int
 302 procfs_open(void *v)
 303 {
 304         struct vop_open_args /* {
 305                 struct vnode *a_vp;
 306                 int  a_mode;
 307                 kauth_cred_t a_cred;
 308         } */ *ap = v;
 309         struct pfsnode *pfs = VTOPFS(ap->a_vp);
 310         struct lwp *l1;
 311         struct proc *p2;
 312         int error;
 313
 314         if ((error = procfs_proc_lock(pfs->pfs_pid, &p2, ENOENT)) != 0)
 315                 return error;
 316
 317         l1 = curlwp;                            /* tracer */
 318
 319 #define M2K(m)  (((m) & FREAD) && ((m) & FWRITE) ? \
 320                  KAUTH_REQ_PROCESS_PROCFS_RW : \
 321                  (m) & FWRITE ? KAUTH_REQ_PROCESS_PROCFS_WRITE : \
 322                  KAUTH_REQ_PROCESS_PROCFS_READ)
 323
 324         mutex_enter(p2->p_lock);
 325         error = kauth_authorize_process(l1->l_cred, KAUTH_PROCESS_PROCFS,
 326             p2, pfs, KAUTH_ARG(M2K(ap->a_mode)), NULL);
 327         mutex_exit(p2->p_lock);
 328         if (error) {
 329                 procfs_proc_unlock(p2);
 330                 return (error);
 331         }
 332
 333 #undef M2K
 334
 335         switch (pfs->pfs_type) {
 336         case PFSmem:
 337                 if (((pfs->pfs_flags & FWRITE) && (ap->a_mode & O_EXCL)) ||
 338                     ((pfs->pfs_flags & O_EXCL) && (ap->a_mode & FWRITE))) {
 339                         error = EBUSY;
 340                         break;
 341                 }
 342
 343                 if (!proc_isunder(p2, l1)) {
 344                         error = EPERM;
 345                         break;
 346                 }
 347
 348                 if (ap->a_mode & FWRITE)
 349                         pfs->pfs_flags = ap->a_mode & (FWRITE|O_EXCL);
 350
 351                 break;
 352
 353         case PFSregs:
 354         case PFSfpregs:
 355                 if (!proc_isunder(p2, l1)) {
 356                         error = EPERM;
 357                         break;
 358                 }
 359                 break;
 360
 361         default:
 362                 break;
 363         }
 364
 365         procfs_proc_unlock(p2);
 366         return (error);
 367 }
 368
 369 /*
 370  * close the pfsnode (vp) after doing i/o.
 371  * (vp) is not locked on entry or exit.
 372  *
 373  * nothing to do for procfs other than undo
 374  * any exclusive open flag (see _open above).
 375  */
 376 int
 377 procfs_close(void *v)
 378 {
 379         struct vop_close_args /* {
 380                 struct vnode *a_vp;
 381                 int  a_fflag;
 382                 kauth_cred_t a_cred;
 383         } */ *ap = v;
 384         struct pfsnode *pfs = VTOPFS(ap->a_vp);
 385
 386         switch (pfs->pfs_type) {
 387         case PFSmem:
 388                 if ((ap->a_fflag & FWRITE) && (pfs->pfs_flags & O_EXCL))
 389                         pfs->pfs_flags &= ~(FWRITE|O_EXCL);
 390                 break;
 391
 392         default:
 393                 break;
 394         }
 395
 396         return (0);
 397 }
 398
 399 /*
 400  * _inactive is called when the pfsnode
 401  * is vrele'd and the reference count goes
 402  * to zero.  (vp) will be on the vnode free
 403  * list, so to get it back vget() must be
 404  * used.
 405  *
 406  * (vp) is locked on entry, but must be unlocked on exit.
 407  */
 408 int
 409 procfs_inactive(void *v)
 410 {
 411         struct vop_inactive_args /* {
 412                 struct vnode *a_vp;
 413                 bool *a_recycle;
 414         } */ *ap = v;
 415         struct vnode *vp = ap->a_vp;
 416         struct pfsnode *pfs = VTOPFS(vp);
 417
 418         mutex_enter(proc_lock);
 419         *ap->a_recycle = (p_find(pfs->pfs_pid, PFIND_LOCKED) == NULL);
 420         mutex_exit(proc_lock);
 421
 422         VOP_UNLOCK(vp, 0);
 423
 424         return (0);
 425 }
 426
 427 /*
 428  * _reclaim is called when getnewvnode()
 429  * wants to make use of an entry on the vnode
 430  * free list.  at this time the filesystem needs
 431  * to free any private data and remove the node
 432  * from any private lists.
 433  */
 434 int
 435 procfs_reclaim(void *v)
 436 {
 437         struct vop_reclaim_args /* {
 438                 struct vnode *a_vp;
 439         } */ *ap = v;
 440
 441         return (procfs_freevp(ap->a_vp));
 442 }
 443
 444 /*
 445  * Return POSIX pathconf information applicable to special devices.
 446  */
 447 int
 448 procfs_pathconf(void *v)
 449 {
 450         struct vop_pathconf_args /* {
 451                 struct vnode *a_vp;
 452                 int a_name;
 453                 register_t *a_retval;
 454         } */ *ap = v;
 455
 456         switch (ap->a_name) {
 457         case _PC_LINK_MAX:
 458                 *ap->a_retval = LINK_MAX;
 459                 return (0);
 460         case _PC_MAX_CANON:
 461                 *ap->a_retval = MAX_CANON;
 462                 return (0);
 463         case _PC_MAX_INPUT:
 464                 *ap->a_retval = MAX_INPUT;
 465                 return (0);
 466         case _PC_PIPE_BUF:
 467                 *ap->a_retval = PIPE_BUF;
 468                 return (0);
 469         case _PC_CHOWN_RESTRICTED:
 470                 *ap->a_retval = 1;
 471                 return (0);
 472         case _PC_VDISABLE:
 473                 *ap->a_retval = _POSIX_VDISABLE;
 474                 return (0);
 475         case _PC_SYNC_IO:
 476                 *ap->a_retval = 1;
 477                 return (0);
 478         default:
 479                 return (EINVAL);
 480         }
 481         /* NOTREACHED */
 482 }
 483
 484 /*
 485  * _print is used for debugging.
 486  * just print a readable description
 487  * of (vp).
 488  */
 489 int
 490 procfs_print(void *v)
 491 {
 492         struct vop_print_args /* {
 493                 struct vnode *a_vp;
 494         } */ *ap = v;
 495         struct pfsnode *pfs = VTOPFS(ap->a_vp);
 496
 497         printf("tag VT_PROCFS, type %d, pid %d, mode %x, flags %lx\n",
 498             pfs->pfs_type, pfs->pfs_pid, pfs->pfs_mode, pfs->pfs_flags);
 499         return 0;
 500 }
 501
 502 int
 503 procfs_link(void *v)
 504 {
 505         struct vop_link_args /* {
 506                 struct vnode *a_dvp;
 507                 struct vnode *a_vp;
 508                 struct componentname *a_cnp;
 509         } */ *ap = v;
 510
 511         VOP_ABORTOP(ap->a_dvp, ap->a_cnp);
 512         vput(ap->a_dvp);
 513         return (EROFS);
 514 }
 515
 516 int
 517 procfs_symlink(void *v)
 518 {
 519         struct vop_symlink_args /* {
 520                 struct vnode *a_dvp;
 521                 struct vnode **a_vpp;
 522                 struct componentname *a_cnp;
 523                 struct vattr *a_vap;
 524                 char *a_target;
 525         } */ *ap = v;
 526
 527         VOP_ABORTOP(ap->a_dvp, ap->a_cnp);
 528         vput(ap->a_dvp);
 529         return (EROFS);
 530 }
 531
 532 /*
 533  * Works out the path to (and vnode of) the target process's current
 534  * working directory or chroot.  If the caller is in a chroot and
 535  * can't "reach" the target's cwd or root (or some other error
 536  * occurs), a "/" is returned for the path and a NULL pointer is
 537  * returned for the vnode.
 538  */
 539 static void
 540 procfs_dir(pfstype t, struct lwp *caller, struct proc *target, char **bpp,
 541     char *path, size_t len)
 542 {
 543         struct cwdinfo *cwdi;
 544         struct vnode *vp, *rvp;
 545         char *bp;
 546
 547         cwdi = caller->l_proc->p_cwdi;
 548         rw_enter(&cwdi->cwdi_lock, RW_READER);
 549
 550         rvp = cwdi->cwdi_rdir;
 551         bp = bpp ? *bpp : NULL;
 552
 553         switch (t) {
 554         case PFScwd:
 555                 vp = target->p_cwdi->cwdi_cdir;
 556                 break;
 557         case PFSchroot:
 558                 vp = target->p_cwdi->cwdi_rdir;
 559                 break;
 560         case PFSexe:
 561                 vp = target->p_textvp;
 562                 break;
 563         default:
 564                 rw_exit(&cwdi->cwdi_lock);
 565                 return;
 566         }
 567
 568         /*
 569          * XXX: this horrible kludge avoids locking panics when
 570          * attempting to lookup links that point to within procfs
 571          */
 572         if (vp != NULL && vp->v_tag == VT_PROCFS) {
 573                 if (bpp) {
 574                         *--bp = '/';
 575                         *bpp = bp;
 576                 }
 577                 rw_exit(&cwdi->cwdi_lock);
 578                 return;
 579         }
 580
 581         if (rvp == NULL)
 582                 rvp = rootvnode;
 583         if (vp == NULL || getcwd_common(vp, rvp, bp ? &bp : NULL, path,
 584             len / 2, 0, caller) != 0) {
 585                 vp = NULL;
 586                 if (bpp) {
 587 /*
 588                         if (t == PFSexe) {
 589                                 snprintf(path, len, "%s/%d/file"
 590                                     mp->mnt_stat.f_mntonname, pfs->pfs_pid);
 591                         } else */ {
 592                                 bp = *bpp;
 593                                 *--bp = '/';
 594                         }
 595                 }
 596         }
 597
 598         if (bpp)
 599                 *bpp = bp;
 600
 601         rw_exit(&cwdi->cwdi_lock);
 602 }
 603
 604 /*
 605  * Invent attributes for pfsnode (vp) and store
 606  * them in (vap).
 607  * Directories lengths are returned as zero since
 608  * any real length would require the genuine size
 609  * to be computed, and nothing cares anyway.
 610  *
 611  * this is relatively minimal for procfs.
 612  */
 613 int
 614 procfs_getattr(void *v)
 615 {
 616         struct vop_getattr_args /* {
 617                 struct vnode *a_vp;
 618                 struct vattr *a_vap;
 619                 kauth_cred_t a_cred;
 620         } */ *ap = v;
 621         struct pfsnode *pfs = VTOPFS(ap->a_vp);
 622         struct vattr *vap = ap->a_vap;
 623         struct proc *procp;
 624         char *path;
 625         int error;
 626
 627         /* first check the process still exists */
 628         switch (pfs->pfs_type) {
 629         case PFSroot:
 630         case PFScurproc:
 631         case PFSself:
 632                 procp = NULL;
 633                 break;
 634
 635         default:
 636                 error = procfs_proc_lock(pfs->pfs_pid, &procp, ENOENT);
 637                 if (error != 0)
 638                         return (error);
 639                 break;
 640         }
 641
 642         switch (pfs->pfs_type) {
 643         case PFScwd:
 644         case PFSchroot:
 645         case PFSexe:
 646                 path = malloc(MAXPATHLEN + 4, M_TEMP, M_WAITOK|M_CANFAIL);
 647                 if (path == NULL && procp != NULL) {
 648                         procfs_proc_unlock(procp);
 649                         return (ENOMEM);
 650                 }
 651                 break;
 652
 653         default:
 654                 path = NULL;
 655                 break;
 656         }
 657
 658         if (procp != NULL) {
 659                 mutex_enter(procp->p_lock);
 660                 error = kauth_authorize_process(kauth_cred_get(),
 661                     KAUTH_PROCESS_CANSEE, procp,
 662                     KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_ENTRY), NULL, NULL);
 663                 mutex_exit(procp->p_lock);
 664                 if (error != 0) {
 665                         procfs_proc_unlock(procp);
 666                         if (path != NULL)
 667                                 free(path, M_TEMP);
 668                         return (ENOENT);
 669                 }
 670         }
 671
 672         error = 0;
 673
 674         /* start by zeroing out the attributes */
 675         vattr_null(vap);
 676
 677         /* next do all the common fields */
 678         vap->va_type = ap->a_vp->v_type;
 679         vap->va_mode = pfs->pfs_mode;
 680         vap->va_fileid = pfs->pfs_fileno;
 681         vap->va_flags = 0;
 682         vap->va_blocksize = PAGE_SIZE;
 683
 684         /*
 685          * Make all times be current TOD.
 686          *
 687          * It would be possible to get the process start
 688          * time from the p_stats structure, but there's
 689          * no "file creation" time stamp anyway, and the
 690          * p_stats structure is not addressable if u. gets
 691          * swapped out for that process.
 692          */
 693         getnanotime(&vap->va_ctime);
 694         vap->va_atime = vap->va_mtime = vap->va_ctime;
 695         if (procp)
 696                 TIMEVAL_TO_TIMESPEC(&procp->p_stats->p_start,
 697                     &vap->va_birthtime);
 698         else
 699                 getnanotime(&vap->va_birthtime);
 700
 701         switch (pfs->pfs_type) {
 702         case PFSmem:
 703         case PFSregs:
 704         case PFSfpregs:
 705 #if defined(__HAVE_PROCFS_MACHDEP) && defined(PROCFS_MACHDEP_PROTECT_CASES)
 706         PROCFS_MACHDEP_PROTECT_CASES
 707 #endif
 708                 /*
 709                  * If the process has exercised some setuid or setgid
 710                  * privilege, then rip away read/write permission so
 711                  * that only root can gain access.
 712                  */
 713                 if (procp->p_flag & PK_SUGID)
 714                         vap->va_mode &= ~(S_IRUSR|S_IWUSR);
 715                 /* FALLTHROUGH */
 716         case PFSctl:
 717         case PFSstatus:
 718         case PFSstat:
 719         case PFSnote:
 720         case PFSnotepg:
 721         case PFSmap:
 722         case PFSmaps:
 723         case PFScmdline:
 724         case PFSemul:
 725         case PFSstatm:
 726                 vap->va_nlink = 1;
 727                 vap->va_uid = kauth_cred_geteuid(procp->p_cred);
 728                 vap->va_gid = kauth_cred_getegid(procp->p_cred);
 729                 break;
 730         case PFSmeminfo:
 731         case PFSdevices:
 732         case PFScpuinfo:
 733         case PFSuptime:
 734         case PFSmounts:
 735         case PFScpustat:
 736         case PFSloadavg:
 737                 vap->va_nlink = 1;
 738                 vap->va_uid = vap->va_gid = 0;
 739                 break;
 740
 741         default:
 742                 break;
 743         }
 744
 745         /*
 746          * now do the object specific fields
 747          *
 748          * The size could be set from struct reg, but it's hardly
 749          * worth the trouble, and it puts some (potentially) machine
 750          * dependent data into this machine-independent code.  If it
 751          * becomes important then this function should break out into
 752          * a per-file stat function in the corresponding .c file.
 753          */
 754
 755         switch (pfs->pfs_type) {
 756         case PFSroot:
 757                 /*
 758                  * Set nlink to 1 to tell fts(3) we don't actually know.
 759                  */
 760                 vap->va_nlink = 1;
 761                 vap->va_uid = 0;
 762                 vap->va_gid = 0;
 763                 vap->va_bytes = vap->va_size = DEV_BSIZE;
 764                 break;
 765
 766         case PFSself:
 767         case PFScurproc: {
 768                 char bf[16];            /* should be enough */
 769                 vap->va_nlink = 1;
 770                 vap->va_uid = 0;
 771                 vap->va_gid = 0;
 772                 vap->va_bytes = vap->va_size =
 773                     snprintf(bf, sizeof(bf), "%ld", (long)curproc->p_pid);
 774                 break;
 775         }
 776
 777         case PFSfd:
 778                 if (pfs->pfs_fd != -1) {
 779                         file_t *fp;
 780
 781                         fp = fd_getfile2(procp, pfs->pfs_fd);
 782                         if (fp == NULL) {
 783                                 error = EBADF;
 784                                 break;
 785                         }
 786                         vap->va_nlink = 1;
 787                         vap->va_uid = kauth_cred_geteuid(fp->f_cred);
 788                         vap->va_gid = kauth_cred_getegid(fp->f_cred);
 789                         switch (fp->f_type) {
 790                         case DTYPE_VNODE:
 791                                 vap->va_bytes = vap->va_size =
 792                                     ((struct vnode *)fp->f_data)->v_size;
 793                                 break;
 794                         default:
 795                                 vap->va_bytes = vap->va_size = 0;
 796                                 break;
 797                         }
 798                         closef(fp);
 799                         break;
 800                 }
 801                 /*FALLTHROUGH*/
 802         case PFSproc:
 803                 vap->va_nlink = 2;
 804                 vap->va_uid = kauth_cred_geteuid(procp->p_cred);
 805                 vap->va_gid = kauth_cred_getegid(procp->p_cred);
 806                 vap->va_bytes = vap->va_size = DEV_BSIZE;
 807                 break;
 808
 809         case PFSfile:
 810                 error = EOPNOTSUPP;
 811                 break;
 812
 813         case PFSmem:
 814                 vap->va_bytes = vap->va_size =
 815                         ctob(procp->p_vmspace->vm_tsize +
 816                                     procp->p_vmspace->vm_dsize +
 817                                     procp->p_vmspace->vm_ssize);
 818                 break;
 819
 820 #if defined(PT_GETREGS) || defined(PT_SETREGS)
 821         case PFSregs:
 822                 vap->va_bytes = vap->va_size = sizeof(struct reg);
 823                 break;
 824 #endif
 825
 826 #if defined(PT_GETFPREGS) || defined(PT_SETFPREGS)
 827         case PFSfpregs:
 828                 vap->va_bytes = vap->va_size = sizeof(struct fpreg);
 829                 break;
 830 #endif
 831
 832         case PFSctl:
 833         case PFSstatus:
 834         case PFSstat:
 835         case PFSnote:
 836         case PFSnotepg:
 837         case PFScmdline:
 838         case PFSmeminfo:
 839         case PFSdevices:
 840         case PFScpuinfo:
 841         case PFSuptime:
 842         case PFSmounts:
 843         case PFScpustat:
 844         case PFSloadavg:
 845         case PFSstatm:
 846                 vap->va_bytes = vap->va_size = 0;
 847                 break;
 848         case PFSmap:
 849         case PFSmaps:
 850                 /*
 851                  * Advise a larger blocksize for the map files, so that
 852                  * they may be read in one pass.
 853                  */
 854                 vap->va_blocksize = 4 * PAGE_SIZE;
 855                 vap->va_bytes = vap->va_size = 0;
 856                 break;
 857
 858         case PFScwd:
 859         case PFSchroot:
 860         case PFSexe: {
 861                 char *bp;
 862
 863                 vap->va_nlink = 1;
 864                 vap->va_uid = 0;
 865                 vap->va_gid = 0;
 866                 bp = path + MAXPATHLEN;
 867                 *--bp = '\0';
 868                 procfs_dir(pfs->pfs_type, curlwp, procp, &bp, path,
 869                      MAXPATHLEN);
 870                 vap->va_bytes = vap->va_size = strlen(bp);
 871                 break;
 872         }
 873
 874         case PFSemul:
 875                 vap->va_bytes = vap->va_size = strlen(procp->p_emul->e_name);
 876                 break;
 877
 878 #ifdef __HAVE_PROCFS_MACHDEP
 879         PROCFS_MACHDEP_NODETYPE_CASES
 880                 error = procfs_machdep_getattr(ap->a_vp, vap, procp);
 881                 break;
 882 #endif
 883
 884         default:
 885                 panic("procfs_getattr");
 886         }
 887
 888         if (procp != NULL)
 889                 procfs_proc_unlock(procp);
 890         if (path != NULL)
 891                 free(path, M_TEMP);
 892
 893         return (error);
 894 }
 895
 896 /*ARGSUSED*/
 897 int
 898 procfs_setattr(void *v)
 899 {
 900         /*
 901          * just fake out attribute setting
 902          * it's not good to generate an error
 903          * return, otherwise things like creat()
 904          * will fail when they try to set the
 905          * file length to 0.  worse, this means
 906          * that echo $note > /proc/$pid/note will fail.
 907          */
 908
 909         return (0);
 910 }
 911
 912 static int
 913 procfs_check_possible(struct vnode *vp, mode_t mode)
 914 {
 915
 916         return 0;
 917 }
 918
 919 static int
 920 procfs_check_permitted(struct vattr *va, mode_t mode, kauth_cred_t cred)
 921 {
 922
 923         return genfs_can_access(va->va_type, va->va_mode,
 924             va->va_uid, va->va_gid, mode, cred);
 925 }
 926
 927 /*
 928  * implement access checking.
 929  *
 930  * actually, the check for super-user is slightly
 931  * broken since it will allow read access to write-only
 932  * objects.  this doesn't cause any particular trouble
 933  * but does mean that the i/o entry points need to check
 934  * that the operation really does make sense.
 935  */
 936 int
 937 procfs_access(void *v)
 938 {
 939         struct vop_access_args /* {
 940                 struct vnode *a_vp;
 941                 int a_mode;
 942                 kauth_cred_t a_cred;
 943         } */ *ap = v;
 944         struct vattr va;
 945         int error;
 946
 947         if ((error = VOP_GETATTR(ap->a_vp, &va, ap->a_cred)) != 0)
 948                 return (error);
 949
 950         error = procfs_check_possible(ap->a_vp, ap->a_mode);
 951         if (error)
 952                 return error;
 953
 954         error = procfs_check_permitted(&va, ap->a_mode, ap->a_cred);
 955
 956         return error;
 957 }
 958
 959 /*
 960  * lookup.  this is incredibly complicated in the
 961  * general case, however for most pseudo-filesystems
 962  * very little needs to be done.
 963  *
 964  * Locking isn't hard here, just poorly documented.
 965  *
 966  * If we're looking up ".", just vref the parent & return it.
 967  *
 968  * If we're looking up "..", unlock the parent, and lock "..". If everything
 969  * went ok, and we're on the last component and the caller requested the
 970  * parent locked, try to re-lock the parent. We do this to prevent lock
 971  * races.
 972  *
 973  * For anything else, get the needed node. Then unlock the parent if not
 974  * the last component or not LOCKPARENT (i.e. if we wouldn't re-lock the
 975  * parent in the .. case).
 976  *
 977  * We try to exit with the parent locked in error cases.
 978  */
 979 int
 980 procfs_lookup(void *v)
 981 {
 982         struct vop_lookup_args /* {
 983                 struct vnode * a_dvp;
 984                 struct vnode ** a_vpp;
 985                 struct componentname * a_cnp;
 986         } */ *ap = v;
 987         struct componentname *cnp = ap->a_cnp;
 988         struct vnode **vpp = ap->a_vpp;
 989         struct vnode *dvp = ap->a_dvp;
 990         const char *pname = cnp->cn_nameptr;
 991         const struct proc_target *pt = NULL;
 992         struct vnode *fvp;
 993         pid_t pid, vnpid;
 994         struct pfsnode *pfs;
 995         struct proc *p = NULL;
 996         struct lwp *plwp;
 997         int i, error;
 998         pfstype type;
 999
1000         *vpp = NULL;
1001
1002         if (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)
1003                 return (EROFS);
1004
1005         if (cnp->cn_namelen == 1 && *pname == '.') {
1006                 *vpp = dvp;
1007                 vref(dvp);
1008                 return (0);
1009         }
1010
1011         pfs = VTOPFS(dvp);
1012         switch (pfs->pfs_type) {
1013         case PFSroot:
1014                 /*
1015                  * Shouldn't get here with .. in the root node.
1016                  */
1017                 if (cnp->cn_flags & ISDOTDOT)
1018                         return (EIO);
1019
1020                 for (i = 0; i < nproc_root_targets; i++) {
1021                         pt = &proc_root_targets[i];
1022                         /*
1023                          * check for node match.  proc is always NULL here,
1024                          * so call pt_valid with constant NULL lwp.
1025                          */
1026                         if (cnp->cn_namelen == pt->pt_namlen &&
1027                             memcmp(pt->pt_name, pname, cnp->cn_namelen) == 0 &&
1028                             (pt->pt_valid == NULL ||
1029                              (*pt->pt_valid)(NULL, dvp->v_mount)))
1030                                 break;
1031                 }
1032
1033                 if (i != nproc_root_targets) {
1034                         error = procfs_allocvp(dvp->v_mount, vpp, 0,
1035                             pt->pt_pfstype, -1, NULL);
1036                         return (error);
1037                 }
1038
1039                 if (CNEQ(cnp, "curproc", 7)) {
1040                         pid = curproc->p_pid;
1041                         vnpid = 0;
1042                         type = PFScurproc;
1043                 } else if (CNEQ(cnp, "self", 4)) {
1044                         pid = curproc->p_pid;
1045                         vnpid = 0;
1046                         type = PFSself;
1047                 } else {
1048                         pid = (pid_t)atoi(pname, cnp->cn_namelen);
1049                         vnpid = pid;
1050                         type = PFSproc;
1051                 }
1052
1053                 if (procfs_proc_lock(pid, &p, ESRCH) != 0)
1054                         break;
1055                 error = procfs_allocvp(dvp->v_mount, vpp, vnpid, type, -1, p);
1056                 procfs_proc_unlock(p);
1057                 return (error);
1058
1059         case PFSproc:
1060                 /*
1061                  * do the .. dance. We unlock the directory, and then
1062                  * get the root dir. That will automatically return ..
1063                  * locked. Then if the caller wanted dvp locked, we
1064                  * re-lock.
1065                  */
1066                 if (cnp->cn_flags & ISDOTDOT) {
1067                         VOP_UNLOCK(dvp, 0);
1068                         error = procfs_root(dvp->v_mount, vpp);
1069                         vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
1070                         return (error);
1071                 }
1072
1073                 if (procfs_proc_lock(pfs->pfs_pid, &p, ESRCH) != 0)
1074                         break;
1075
1076                 mutex_enter(p->p_lock);
1077                 LIST_FOREACH(plwp, &p->p_lwps, l_sibling) {
1078                         if (plwp->l_stat != LSZOMB)
1079                                 break;
1080                 }
1081                 /* Process is exiting if no-LWPS or all LWPs are LSZOMB */
1082                 if (plwp == NULL) {
1083                         mutex_exit(p->p_lock);
1084                         procfs_proc_unlock(p);
1085                         return ESRCH;
1086                 }
1087
1088                 lwp_addref(plwp);
1089                 mutex_exit(p->p_lock);
1090
1091                 for (pt = proc_targets, i = 0; i < nproc_targets; pt++, i++) {
1092                         int found;
1093
1094                         found = cnp->cn_namelen == pt->pt_namlen &&
1095                             memcmp(pt->pt_name, pname, cnp->cn_namelen) == 0 &&
1096                             (pt->pt_valid == NULL
1097                               || (*pt->pt_valid)(plwp, dvp->v_mount));
1098                         if (found)
1099                                 break;
1100                 }
1101                 lwp_delref(plwp);
1102
1103                 if (i == nproc_targets) {
1104                         procfs_proc_unlock(p);
1105                         break;
1106                 }
1107                 if (pt->pt_pfstype == PFSfile) {
1108                         fvp = p->p_textvp;
1109                         /* We already checked that it exists. */
1110                         vref(fvp);
1111                         procfs_proc_unlock(p);
1112                         vn_lock(fvp, LK_EXCLUSIVE | LK_RETRY);
1113                         *vpp = fvp;
1114                         return (0);
1115                 }
1116
1117                 error = procfs_allocvp(dvp->v_mount, vpp, pfs->pfs_pid,
1118                     pt->pt_pfstype, -1, p);
1119                 procfs_proc_unlock(p);
1120                 return (error);
1121
1122         case PFSfd: {
1123                 int fd;
1124                 file_t *fp;
1125
1126                 if ((error = procfs_proc_lock(pfs->pfs_pid, &p, ENOENT)) != 0)
1127                         return error;
1128
1129                 /*
1130                  * do the .. dance. We unlock the directory, and then
1131                  * get the proc dir. That will automatically return ..
1132                  * locked. Then re-lock the directory.
1133                  */
1134                 if (cnp->cn_flags & ISDOTDOT) {
1135                         VOP_UNLOCK(dvp, 0);
1136                         error = procfs_allocvp(dvp->v_mount, vpp, pfs->pfs_pid,
1137                             PFSproc, -1, p);
1138                         procfs_proc_unlock(p);
1139                         vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
1140                         return (error);
1141                 }
1142                 fd = atoi(pname, cnp->cn_namelen);
1143
1144                 fp = fd_getfile2(p, fd);
1145                 if (fp == NULL) {
1146                         procfs_proc_unlock(p);
1147                         return ENOENT;
1148                 }
1149                 fvp = fp->f_data;
1150
1151                 /* Don't show directories */
1152                 if (fp->f_type == DTYPE_VNODE && fvp->v_type != VDIR) {
1153                         vref(fvp);
1154                         closef(fp);
1155                         procfs_proc_unlock(p);
1156                         vn_lock(fvp, LK_EXCLUSIVE | LK_RETRY |
1157                             (p == curproc ? LK_CANRECURSE : 0));
1158                         *vpp = fvp;
1159                         return 0;
1160                 }
1161
1162                 closef(fp);
1163                 error = procfs_allocvp(dvp->v_mount, vpp, pfs->pfs_pid,
1164                     PFSfd, fd, p);
1165                 procfs_proc_unlock(p);
1166                 return error;
1167         }
1168         default:
1169                 return (ENOTDIR);
1170         }
1171
1172         return (cnp->cn_nameiop == LOOKUP ? ENOENT : EROFS);
1173 }
1174
1175 int
1176 procfs_validfile(struct lwp *l, struct mount *mp)
1177 {
1178         return l != NULL && l->l_proc != NULL && l->l_proc->p_textvp != NULL;
1179 }
1180
1181 static int
1182 procfs_validfile_linux(struct lwp *l, struct mount *mp)
1183 {
1184         int flags;
1185
1186         flags = VFSTOPROC(mp)->pmnt_flags;
1187         return (flags & PROCFSMNT_LINUXCOMPAT) &&
1188             (l == NULL || l->l_proc == NULL || procfs_validfile(l, mp));
1189 }
1190
/*
 * State shared between procfs_readdir() and
 * procfs_root_readdir_callback() while the process entries of the
 * procfs root directory are being generated.
 */
struct procfs_root_readdir_ctx {
        struct uio *uiop;       /* where the directory entries are copied */
        off_t *cookies;         /* next cookie slot to fill, or NULL */
        int ncookies;           /* number of entries copied out so far */
        off_t off;              /* directory offset of the current entry */
        off_t startoff;         /* first directory offset to copy out */
        int error;              /* error from uiomove(), 0 if none */
};
1199
1200 static int
1201 procfs_root_readdir_callback(struct proc *p, void *arg)
1202 {
1203         struct procfs_root_readdir_ctx *ctxp = arg;
1204         struct dirent d;
1205         struct uio *uiop;
1206         int error;
1207
1208         uiop = ctxp->uiop;
1209         if (uiop->uio_resid < UIO_MX)
1210                 return -1; /* no space */
1211
1212         if (ctxp->off < ctxp->startoff) {
1213                 ctxp->off++;
1214                 return 0;
1215         }
1216
1217         if (kauth_authorize_process(kauth_cred_get(),
1218             KAUTH_PROCESS_CANSEE, p,
1219             KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_ENTRY), NULL, NULL) != 0)
1220                 return 0;
1221
1222         memset(&d, 0, UIO_MX);
1223         d.d_reclen = UIO_MX;
1224         d.d_fileno = PROCFS_FILENO(p->p_pid, PFSproc, -1);
1225         d.d_namlen = snprintf(d.d_name,
1226             UIO_MX - offsetof(struct dirent, d_name), "%ld", (long)p->p_pid);
1227         d.d_type = DT_DIR;
1228
1229         mutex_exit(proc_lock);
1230         error = uiomove(&d, UIO_MX, uiop);
1231         mutex_enter(proc_lock);
1232         if (error) {
1233                 ctxp->error = error;
1234                 return -1;
1235         }
1236
1237         ctxp->ncookies++;
1238         if (ctxp->cookies)
1239                 *(ctxp->cookies)++ = ctxp->off + 1;
1240         ctxp->off++;
1241
1242         return 0;
1243 }
1244
1245 /*
1246  * readdir returns directory entries from pfsnode (vp).
1247  *
1248  * the strategy here with procfs is to generate a single
1249  * directory entry at a time (struct dirent) and then
1250  * copy that out to userland using uiomove.  a more efficent
1251  * though more complex implementation, would try to minimize
1252  * the number of calls to uiomove().  for procfs, this is
1253  * hardly worth the added code complexity.
1254  *
1255  * this should just be done through read()
1256  */
1257 int
1258 procfs_readdir(void *v)
1259 {
1260         struct vop_readdir_args /* {
1261                 struct vnode *a_vp;
1262                 struct uio *a_uio;
1263                 kauth_cred_t a_cred;
1264                 int *a_eofflag;
1265                 off_t **a_cookies;
1266                 int *a_ncookies;
1267         } */ *ap = v;
1268         struct uio *uio = ap->a_uio;
1269         struct dirent d;
1270         struct pfsnode *pfs;
1271         off_t i;
1272         int error;
1273         off_t *cookies = NULL;
1274         int ncookies;
1275         struct vnode *vp;
1276         const struct proc_target *pt;
1277         struct procfs_root_readdir_ctx ctx;
1278         struct lwp *l;
1279         int nfd;
1280
1281         vp = ap->a_vp;
1282         pfs = VTOPFS(vp);
1283
1284         if (uio->uio_resid < UIO_MX)
1285                 return (EINVAL);
1286         if (uio->uio_offset < 0)
1287                 return (EINVAL);
1288
1289         error = 0;
1290         i = uio->uio_offset;
1291         memset(&d, 0, UIO_MX);
1292         d.d_reclen = UIO_MX;
1293         ncookies = uio->uio_resid / UIO_MX;
1294
1295         switch (pfs->pfs_type) {
1296         /*
1297          * this is for the process-specific sub-directories.
1298          * all that is needed to is copy out all the entries
1299          * from the procent[] table (top of this file).
1300          */
1301         case PFSproc: {
1302                 struct proc *p;
1303
1304                 if (i >= nproc_targets)
1305                         return 0;
1306
1307                 if (procfs_proc_lock(pfs->pfs_pid, &p, ESRCH) != 0)
1308                         break;
1309
1310                 if (ap->a_ncookies) {
1311                         ncookies = min(ncookies, (nproc_targets - i));
1312                         cookies = malloc(ncookies * sizeof (off_t),
1313                             M_TEMP, M_WAITOK);
1314                         *ap->a_cookies = cookies;
1315                 }
1316
1317                 for (pt = &proc_targets[i];
1318                      uio->uio_resid >= UIO_MX && i < nproc_targets; pt++, i++) {
1319                         if (pt->pt_valid) {
1320                                 /* XXXSMP LWP can disappear */
1321                                 mutex_enter(p->p_lock);
1322                                 l = LIST_FIRST(&p->p_lwps);
1323                                 KASSERT(l != NULL);
1324                                 mutex_exit(p->p_lock);
1325                                 if ((*pt->pt_valid)(l, vp->v_mount) == 0)
1326                                         continue;
1327                         }
1328
1329                         d.d_fileno = PROCFS_FILENO(pfs->pfs_pid,
1330                             pt->pt_pfstype, -1);
1331                         d.d_namlen = pt->pt_namlen;
1332                         memcpy(d.d_name, pt->pt_name, pt->pt_namlen + 1);
1333                         d.d_type = pt->pt_type;
1334
1335                         if ((error = uiomove(&d, UIO_MX, uio)) != 0)
1336                                 break;
1337                         if (cookies)
1338                                 *cookies++ = i + 1;
1339                 }
1340
1341                 procfs_proc_unlock(p);
1342                 break;
1343         }
1344         case PFSfd: {
1345                 struct proc *p;
1346                 file_t *fp;
1347                 int lim, nc = 0;
1348
1349                 if ((error = procfs_proc_lock(pfs->pfs_pid, &p, ESRCH)) != 0)
1350                         return error;
1351
1352                 /* XXX Should this be by file as well? */
1353                 if (kauth_authorize_process(kauth_cred_get(),
1354                     KAUTH_PROCESS_CANSEE, p,
1355                     KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_OPENFILES), NULL,
1356                     NULL) != 0) {
1357                         procfs_proc_unlock(p);
1358                         return ESRCH;
1359                 }
1360
1361                 nfd = p->p_fd->fd_dt->dt_nfiles;
1362
1363                 lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles);
1364                 if (i >= lim) {
1365                         procfs_proc_unlock(p);
1366                         return 0;
1367                 }
1368
1369                 if (ap->a_ncookies) {
1370                         ncookies = min(ncookies, (nfd + 2 - i));
1371                         cookies = malloc(ncookies * sizeof (off_t),
1372                             M_TEMP, M_WAITOK);
1373                         *ap->a_cookies = cookies;
1374                 }
1375
1376                 for (; i < 2 && uio->uio_resid >= UIO_MX; i++) {
1377                         pt = &proc_targets[i];
1378                         d.d_namlen = pt->pt_namlen;
1379                         d.d_fileno = PROCFS_FILENO(pfs->pfs_pid,
1380                             pt->pt_pfstype, -1);
1381                         (void)memcpy(d.d_name, pt->pt_name, pt->pt_namlen + 1);
1382                         d.d_type = pt->pt_type;
1383                         if ((error = uiomove(&d, UIO_MX, uio)) != 0)
1384                                 break;
1385                         if (cookies)
1386                                 *cookies++ = i + 1;
1387                         nc++;
1388                 }
1389                 if (error) {
1390                         ncookies = nc;
1391                         break;
1392                 }
1393                 for (; uio->uio_resid >= UIO_MX && i < nfd; i++) {
1394                         /* check the descriptor exists */
1395                         if ((fp = fd_getfile2(p, i - 2)) == NULL)
1396                                 continue;
1397                         closef(fp);
1398
1399                         d.d_fileno = PROCFS_FILENO(pfs->pfs_pid, PFSfd, i - 2);
1400                         d.d_namlen = snprintf(d.d_name, sizeof(d.d_name),
1401                             "%lld", (long long)(i - 2));
1402                         d.d_type = VREG;
1403                         if ((error = uiomove(&d, UIO_MX, uio)) != 0)
1404                                 break;
1405                         if (cookies)
1406                                 *cookies++ = i + 1;
1407                         nc++;
1408                 }
1409                 ncookies = nc;
1410                 procfs_proc_unlock(p);
1411                 break;
1412         }
1413
1414         /*
1415          * this is for the root of the procfs filesystem
1416          * what is needed are special entries for "curproc"
1417          * and "self" followed by an entry for each process
1418          * on allproc.
1419          */
1420
1421         case PFSroot: {
1422                 int nc = 0;
1423
1424                 if (ap->a_ncookies) {
1425                         /*
1426                          * XXX Potentially allocating too much space here,
1427                          * but I'm lazy. This loop needs some work.
1428                          */
1429                         cookies = malloc(ncookies * sizeof (off_t),
1430                             M_TEMP, M_WAITOK);
1431                         *ap->a_cookies = cookies;
1432                 }
1433                 error = 0;
1434                 /* 0 ... 3 are static entries. */
1435                 for (; i <= 3 && uio->uio_resid >= UIO_MX; i++) {
1436                         switch (i) {
1437                         case 0:         /* `.' */
1438                         case 1:         /* `..' */
1439                                 d.d_fileno = PROCFS_FILENO(0, PFSroot, -1);
1440                                 d.d_namlen = i + 1;
1441                                 memcpy(d.d_name, "..", d.d_namlen);
1442                                 d.d_name[i + 1] = '\0';
1443                                 d.d_type = DT_DIR;
1444                                 break;
1445
1446                         case 2:
1447                                 d.d_fileno = PROCFS_FILENO(0, PFScurproc, -1);
1448                                 d.d_namlen = sizeof("curproc") - 1;
1449                                 memcpy(d.d_name, "curproc", sizeof("curproc"));
1450                                 d.d_type = DT_LNK;
1451                                 break;
1452
1453                         case 3:
1454                                 d.d_fileno = PROCFS_FILENO(0, PFSself, -1);
1455                                 d.d_namlen = sizeof("self") - 1;
1456                                 memcpy(d.d_name, "self", sizeof("self"));
1457                                 d.d_type = DT_LNK;
1458                                 break;
1459                         }
1460
1461                         if ((error = uiomove(&d, UIO_MX, uio)) != 0)
1462                                 break;
1463                         nc++;
1464                         if (cookies)
1465                                 *cookies++ = i + 1;
1466                 }
1467                 /* 4 ... are process entries. */
1468                 ctx.uiop = uio;
1469                 ctx.error = 0;
1470                 ctx.off = 4;
1471                 ctx.startoff = i;
1472                 ctx.cookies = cookies;
1473                 ctx.ncookies = nc;
1474                 proclist_foreach_call(&allproc,
1475                     procfs_root_readdir_callback, &ctx);
1476                 cookies = ctx.cookies;
1477                 nc = ctx.ncookies;
1478                 error = ctx.error;
1479                 if (error)
1480                         break;
1481
1482                 /* misc entries. */
1483                 if (i < ctx.off)
1484                         i = ctx.off;
1485                 if (i >= ctx.off + nproc_root_targets)
1486                         break;
1487                 for (pt = &proc_root_targets[i - ctx.off];
1488                     uio->uio_resid >= UIO_MX &&
1489                     pt < &proc_root_targets[nproc_root_targets];
1490                     pt++, i++) {
1491                         if (pt->pt_valid &&
1492                             (*pt->pt_valid)(NULL, vp->v_mount) == 0)
1493                                 continue;
1494                         d.d_fileno = PROCFS_FILENO(0, pt->pt_pfstype, -1);
1495                         d.d_namlen = pt->pt_namlen;
1496                         memcpy(d.d_name, pt->pt_name, pt->pt_namlen + 1);
1497                         d.d_type = pt->pt_type;
1498
1499                         if ((error = uiomove(&d, UIO_MX, uio)) != 0)
1500                                 break;
1501                         nc++;
1502                         if (cookies)
1503                                 *cookies++ = i + 1;
1504                 }
1505
1506                 ncookies = nc;
1507                 break;
1508         }
1509
1510         default:
1511                 error = ENOTDIR;
1512                 break;
1513         }
1514
1515         if (ap->a_ncookies) {
1516                 if (error) {
1517                         if (cookies)
1518                                 free(*ap->a_cookies, M_TEMP);
1519                         *ap->a_ncookies = 0;
1520                         *ap->a_cookies = NULL;
1521                 } else
1522                         *ap->a_ncookies = ncookies;
1523         }
1524         uio->uio_offset = i;
1525         return (error);
1526 }
1527
1528 /*
1529  * readlink reads the link of `curproc' and others
1530  */
1531 int
1532 procfs_readlink(void *v)
1533 {
1534         struct vop_readlink_args *ap = v;
1535         char bf[16];            /* should be enough */
1536         char *bp = bf;
1537         char *path = NULL;
1538         int len = 0;
1539         int error = 0;
1540         struct pfsnode *pfs = VTOPFS(ap->a_vp);
1541         struct proc *pown;
1542
1543         if (pfs->pfs_fileno == PROCFS_FILENO(0, PFScurproc, -1))
1544                 len = snprintf(bf, sizeof(bf), "%ld", (long)curproc->p_pid);
1545         else if (pfs->pfs_fileno == PROCFS_FILENO(0, PFSself, -1))
1546                 len = snprintf(bf, sizeof(bf), "%s", "curproc");
1547         else if (pfs->pfs_fileno == PROCFS_FILENO(pfs->pfs_pid, PFScwd, -1) ||
1548             pfs->pfs_fileno == PROCFS_FILENO(pfs->pfs_pid, PFSchroot, -1) ||
1549             pfs->pfs_fileno == PROCFS_FILENO(pfs->pfs_pid, PFSexe, -1)) {
1550                 if ((error = procfs_proc_lock(pfs->pfs_pid, &pown, ESRCH)) != 0)
1551                         return error;
1552                 path = malloc(MAXPATHLEN + 4, M_TEMP, M_WAITOK|M_CANFAIL);
1553                 if (path == NULL) {
1554                         procfs_proc_unlock(pown);
1555                         return (ENOMEM);
1556                 }
1557                 bp = path + MAXPATHLEN;
1558                 *--bp = '\0';
1559                 procfs_dir(PROCFS_TYPE(pfs->pfs_fileno), curlwp, pown,
1560                     &bp, path, MAXPATHLEN);
1561                 procfs_proc_unlock(pown);
1562                 len = strlen(bp);
1563         } else {
1564                 file_t *fp;
1565                 struct vnode *vxp, *vp;
1566
1567                 if ((error = procfs_proc_lock(pfs->pfs_pid, &pown, ESRCH)) != 0)
1568                         return error;
1569
1570                 fp = fd_getfile2(pown, pfs->pfs_fd);
1571                 if (fp == NULL) {
1572                         procfs_proc_unlock(pown);
1573                         return EBADF;
1574                 }
1575
1576                 switch (fp->f_type) {
1577                 case DTYPE_VNODE:
1578                         vxp = (struct vnode *)fp->f_data;
1579                         if (vxp->v_type != VDIR) {
1580                                 error = EINVAL;
1581                                 break;
1582                         }
1583                         if ((path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK))
1584                             == NULL) {
1585                                 error = ENOMEM;
1586                                 break;
1587                         }
1588                         bp = path + MAXPATHLEN;
1589                         *--bp = '\0';
1590
1591                         /*
1592                          * XXX: kludge to avoid locking against ourselves
1593                          * in getcwd()
1594                          */
1595                         if (vxp->v_tag == VT_PROCFS) {
1596                                 *--bp = '/';
1597                         } else {
1598                                 rw_enter(&curproc->p_cwdi->cwdi_lock, RW_READER);
1599                                 vp = curproc->p_cwdi->cwdi_rdir;
1600                                 if (vp == NULL)
1601                                         vp = rootvnode;
1602                                 error = getcwd_common(vxp, vp, &bp, path,
1603                                     MAXPATHLEN / 2, 0, curlwp);
1604                                 rw_exit(&curproc->p_cwdi->cwdi_lock);
1605                         }
1606                         if (error)
1607                                 break;
1608                         len = strlen(bp);
1609                         break;
1610
1611                 case DTYPE_MISC:
1612                         len = snprintf(bf, sizeof(bf), "%s", "[misc]");
1613                         break;
1614
1615                 case DTYPE_KQUEUE:
1616                         len = snprintf(bf, sizeof(bf), "%s", "[kqueue]");
1617                         break;
1618
1619                 default:
1620                         error = EINVAL;
1621                         break;
1622                 }
1623                 closef(fp);
1624                 procfs_proc_unlock(pown);
1625         }
1626
1627         if (error == 0)
1628                 error = uiomove(bp, len, ap->a_uio);
1629         if (path)
1630                 free(path, M_TEMP);
1631         return error;
1632 }
1633
/*
 * convert decimal ascii to int
 *
 * Parses exactly `len' bytes starting at `b'; the input does not have
 * to be NUL-terminated.  Returns the parsed value, or -1 if any byte
 * is not a decimal digit or if the value does not fit in an int (a
 * name with too many digits cannot be a valid pid or descriptor).  An
 * empty input (len == 0) yields 0.
 */
static int
atoi(const char *b, size_t len)
{
        /* Largest int, derived locally so that no extra header is needed. */
        const int int_max = (int)(~0U >> 1);
        int p = 0;

        while (len--) {
                char c = *b++;
                int digit;

                if (c < '0' || c > '9')
                        return -1;
                digit = c - '0';

                /* Reject the digit if 10 * p + digit would exceed int_max. */
                if (p > (int_max - digit) / 10)
                        return -1;
                p = 10 * p + digit;
        }

        return p;
}