/*
 * Source: apple/xnu (tag xnu-2050.7.9) — bsd/vfs/vfs_vnops.c
 * (provenance note recovered from git web-viewer scrape residue)
 */
1 /*
2 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1982, 1986, 1989, 1993
31 * The Regents of the University of California. All rights reserved.
32 * (c) UNIX System Laboratories, Inc.
33 * All or some portions of this file are derived from material licensed
34 * to the University of California by American Telephone and Telegraph
35 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
36 * the permission of UNIX System Laboratories, Inc.
37 *
38 * Redistribution and use in source and binary forms, with or without
39 * modification, are permitted provided that the following conditions
40 * are met:
41 * 1. Redistributions of source code must retain the above copyright
42 * notice, this list of conditions and the following disclaimer.
43 * 2. Redistributions in binary form must reproduce the above copyright
44 * notice, this list of conditions and the following disclaimer in the
45 * documentation and/or other materials provided with the distribution.
46 * 3. All advertising materials mentioning features or use of this software
47 * must display the following acknowledgement:
48 * This product includes software developed by the University of
49 * California, Berkeley and its contributors.
50 * 4. Neither the name of the University nor the names of its contributors
51 * may be used to endorse or promote products derived from this software
52 * without specific prior written permission.
53 *
54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64 * SUCH DAMAGE.
65 *
66 * @(#)vfs_vnops.c 8.14 (Berkeley) 6/15/95
67 *
68 */
69 /*
70 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
71 * support for mandatory and extensible security protections. This notice
72 * is included in support of clause 2.2 (b) of the Apple Public License,
73 * Version 2.0.
74 */
75
76 #include <sys/param.h>
77 #include <sys/types.h>
78 #include <sys/systm.h>
79 #include <sys/kernel.h>
80 #include <sys/file_internal.h>
81 #include <sys/stat.h>
82 #include <sys/proc_internal.h>
83 #include <sys/kauth.h>
84 #include <sys/mount_internal.h>
85 #include <sys/namei.h>
86 #include <sys/vnode_internal.h>
87 #include <sys/ioctl.h>
88 #include <sys/tty.h>
/* Temporary workaround for ubc.h until <rdar://4714366> is resolved */
90 #define ubc_setcred ubc_setcred_deprecated
91 #include <sys/ubc.h>
92 #undef ubc_setcred
93 int ubc_setcred(struct vnode *, struct proc *);
94 #include <sys/conf.h>
95 #include <sys/disk.h>
96 #include <sys/fsevents.h>
97 #include <sys/kdebug.h>
98 #include <sys/xattr.h>
99 #include <sys/ubc_internal.h>
100 #include <sys/uio_internal.h>
101 #include <sys/resourcevar.h>
102 #include <sys/signalvar.h>
103
104 #include <vm/vm_kern.h>
105 #include <vm/vm_map.h>
106
107 #include <miscfs/specfs/specdev.h>
108 #include <miscfs/fifofs/fifo.h>
109
110 #if CONFIG_MACF
111 #include <security/mac_framework.h>
112 #endif
113
114 #if CONFIG_PROTECT
115 #include <sys/cprotect.h>
116 #endif
117
118
119 static int vn_closefile(struct fileglob *fp, vfs_context_t ctx);
120 static int vn_ioctl(struct fileproc *fp, u_long com, caddr_t data,
121 vfs_context_t ctx);
122 static int vn_read(struct fileproc *fp, struct uio *uio, int flags,
123 vfs_context_t ctx);
124 static int vn_write(struct fileproc *fp, struct uio *uio, int flags,
125 vfs_context_t ctx);
126 static int vn_select( struct fileproc *fp, int which, void * wql,
127 vfs_context_t ctx);
128 static int vn_kqfilt_add(struct fileproc *fp, struct knote *kn,
129 vfs_context_t ctx);
130 static void filt_vndetach(struct knote *kn);
131 static int filt_vnode(struct knote *kn, long hint);
132 static int vn_open_auth_finish(vnode_t vp, int fmode, vfs_context_t ctx);
133 #if 0
134 static int vn_kqfilt_remove(struct vnode *vp, uintptr_t ident,
135 vfs_context_t ctx);
136 #endif
137
/*
 * File-descriptor operations table for vnode-backed files: routes the
 * generic read/write/ioctl/select/close/kqfilter file operations to the
 * vn_* implementations in this file.  The final slot is NULL (unused here).
 */
struct fileops vnops =
	{ vn_read, vn_write, vn_ioctl, vn_select, vn_closefile, vn_kqfilt_add, NULL };
140
/*
 * Knote filter operations for vnode kevents.  f_attach is NULL;
 * attachment is presumably handled by vn_kqfilt_add() via the fileops
 * table above (NOTE(review): confirm against kevent dispatch code).
 * Detach and event delivery go to the filt_vnode* routines.
 */
struct filterops vnode_filtops = {
	.f_isfd = 1,
	.f_attach = NULL,
	.f_detach = filt_vndetach,
	.f_event = filt_vnode
};
147
/*
 * Common code for vnode open operations: check permissions and invoke
 * VNOP_OPEN or VNOP_CREATE.  This entry point discards any flag updates
 * made during the open; use vn_open_modflags() to observe them.
 *
 * XXX the profusion of interfaces here is probably a bad thing.
 */
int
vn_open(struct nameidata *ndp, int fmode, int cmode)
{
	int mode = fmode;

	return vn_open_modflags(ndp, &mode, cmode);
}
159
160 int
161 vn_open_modflags(struct nameidata *ndp, int *fmodep, int cmode)
162 {
163 struct vnode_attr va;
164
165 VATTR_INIT(&va);
166 VATTR_SET(&va, va_mode, cmode);
167
168 return(vn_open_auth(ndp, fmodep, &va));
169 }
170
171 static int
172 vn_open_auth_finish(vnode_t vp, int fmode, vfs_context_t ctx)
173 {
174 int error;
175
176 if ((error = vnode_ref_ext(vp, fmode, 0)) != 0) {
177 goto bad;
178 }
179
180 /* call out to allow 3rd party notification of open.
181 * Ignore result of kauth_authorize_fileop call.
182 */
183 kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_OPEN,
184 (uintptr_t)vp, 0);
185
186 return 0;
187
188 bad:
189 return error;
190
191 }
192
/*
 * Create (and possibly open, via a compound VNOP) the regular file named
 * by ndp, as part of an open(O_CREAT).
 *
 * May do nameidone() to allow safely adding an FSEvent.  Cue off of ni_dvp to
 * determine whether that has happened: on return, ndp->ni_dvp is NULLVP and
 * its iocount dropped, EXCEPT when EKEEPLOOKING is returned, in which case
 * everything is left intact so the caller can continue the lookup.
 *
 * On success, *did_create reports whether a new node was created and
 * *did_open reports whether a compound VNOP already opened it (in which
 * case the caller must not issue a separate VNOP_OPEN()).
 */
static int
vn_open_auth_do_create(struct nameidata *ndp, struct vnode_attr *vap, int fmode, boolean_t *did_create, boolean_t *did_open, vfs_context_t ctx)
{
	uint32_t status = 0;
	vnode_t dvp = ndp->ni_dvp;
	int batched;
	int error;
	vnode_t vp;

	/* Can this filesystem do lookup+create+open in a single VNOP? */
	batched = vnode_compound_open_available(ndp->ni_dvp);
	*did_open = FALSE;

	VATTR_SET(vap, va_type, VREG);
	if (fmode & O_EXCL)
		vap->va_vaflags |= VA_EXCLUSIVE;

#if NAMEDRSRCFORK
	/* Resource forks are materialized as named streams, not via vn_create(). */
	if (ndp->ni_cnd.cn_flags & CN_WANTSRSRCFORK) {
		if ((error = vn_authorize_create(dvp, &ndp->ni_cnd, vap, ctx, NULL)) != 0)
			goto out;
		if ((error = vnode_makenamedstream(dvp, &ndp->ni_vp, XATTR_RESOURCEFORK_NAME, 0, ctx)) != 0)
			goto out;
		*did_create = TRUE;
	} else {
#endif
		if (!batched) {
			/* Non-compound path: authorize here; compound FSes authorize internally. */
			if ((error = vn_authorize_create(dvp, &ndp->ni_cnd, vap, ctx, NULL)) != 0)
				goto out;
		}

		error = vn_create(dvp, &ndp->ni_vp, ndp, vap, VN_CREATE_DOOPEN, fmode, &status, ctx);
		if (error != 0) {
			if (batched) {
				/* A compound VNOP may have created the node before failing. */
				*did_create = (status & COMPOUND_OPEN_STATUS_DID_CREATE) ? TRUE : FALSE;
			} else {
				*did_create = FALSE;
			}

			if (error == EKEEPLOOKING) {
				/* Sanity-check the "keep looking" contract before returning. */
				if (*did_create) {
					panic("EKEEPLOOKING, but we did a create?");
				}
				if (!batched) {
					panic("EKEEPLOOKING from filesystem that doesn't support compound vnops?");
				}
				if ((ndp->ni_flag & NAMEI_CONTLOOKUP) == 0) {
					panic("EKEEPLOOKING, but continue flag not set?");
				}

				/*
				 * Do NOT drop the dvp: we need everything to continue the lookup.
				 */
				return error;
			}
		} else {
			if (batched) {
				*did_create = (status & COMPOUND_OPEN_STATUS_DID_CREATE) ? 1 : 0;
				/* A successful compound open leaves the file already open. */
				*did_open = TRUE;
			} else {
				*did_create = TRUE;
			}
		}
#if NAMEDRSRCFORK
	}
#endif

	/*
	 * Unlock the fsnode (if locked) here so that we are free
	 * to drop the dvp iocount and prevent deadlock in build_path().
	 * nameidone() will still do the right thing later.
	 */
	vp = ndp->ni_vp;
	namei_unlock_fsnode(ndp);

	if (*did_create) {
		int update_flags = 0;

		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL)
			update_flags |= VNODE_UPDATE_NAME;
		if (vp->v_parent == NULLVP)
			update_flags |= VNODE_UPDATE_PARENT;

		if (update_flags)
			vnode_update_identity(vp, dvp, ndp->ni_cnd.cn_nameptr, ndp->ni_cnd.cn_namelen, ndp->ni_cnd.cn_hash, update_flags);

		/* Drop the directory iocount before posting the fsevent. */
		vnode_put(dvp);
		ndp->ni_dvp = NULLVP;

#if CONFIG_FSE
		if (need_fsevent(FSE_CREATE_FILE, vp)) {
			add_fsevent(FSE_CREATE_FILE, ctx,
				FSE_ARG_VNODE, vp,
				FSE_ARG_DONE);
		}
#endif
	}
out:
	/* All exits except EKEEPLOOKING drop the dvp iocount and clear ni_dvp. */
	if (ndp->ni_dvp != NULLVP) {
		vnode_put(dvp);
		ndp->ni_dvp = NULLVP;
	}

	return error;
}
302
/*
 * Open a file with authorization, updating the contents of the structures
 * pointed to by ndp, fmodep, and vap as necessary to perform the requested
 * operation.  This function is used for both opens of existing files, and
 * creation of new files.
 *
 * Parameters:	ndp			The namei data pointer describing the
 *					file
 *		fmodep			A pointer to an int containing the mode
 *					information to be used for the open
 *		vap			A pointer to the vnode attribute
 *					descriptor to be used for the open
 *
 * Indirect:	*			Contents of the data structures pointed
 *					to by the parameters are modified as
 *					necessary to the requested operation.
 *
 * Returns:	0			Success
 *		!0			errno value
 *
 * Notes:	The kauth_filesec_t in 'vap', if any, is in host byte order.
 *
 *		The contents of '*ndp' will be modified, based on the other
 *		arguments to this function, and to return file and directory
 *		data necessary to satisfy the requested operation.
 *
 *		If the file does not exist and we are creating it, then the
 *		O_TRUNC flag will be cleared in '*fmodep' to indicate to the
 *		caller that the file was not truncated.
 *
 *		If the file exists and the O_EXCL flag was not specified, then
 *		the O_CREAT flag will be cleared in '*fmodep' to indicate to
 *		the caller that the existing file was merely opened rather
 *		than created.
 *
 *		The contents of '*vap' will be modified as necessary to
 *		complete the operation, including setting of supported
 *		attribute, clearing of fields containing unsupported attributes
 *		in the request, if the request proceeds without them, etc..
 *
 * XXX:		This function is too complicated in acting on its arguments
 *
 * XXX:		We should enumerate the possible errno values here, and where
 *		in the code they originated.
 */
int
vn_open_auth(struct nameidata *ndp, int *fmodep, struct vnode_attr *vap)
{
	struct vnode *vp;
	struct vnode *dvp;
	vfs_context_t ctx = ndp->ni_cnd.cn_context;
	int error;
	int fmode;
	uint32_t origcnflags;
	boolean_t did_create;
	boolean_t did_open;
	boolean_t need_vnop_open;
	boolean_t batched;
	boolean_t ref_failed;

again:
	/*
	 * Restart point: taken on create/unlink races, EREDRIVEOPEN
	 * (tty allocation race), and failed reference grabs.  All
	 * per-attempt state is reset here.
	 */
	vp = NULL;
	dvp = NULL;
	batched = FALSE;
	did_create = FALSE;
	need_vnop_open = TRUE;
	ref_failed = FALSE;
	fmode = *fmodep;
	origcnflags = ndp->ni_cnd.cn_flags;

	/*
	 * O_CREAT
	 */
	if (fmode & O_CREAT) {
		if ( (fmode & O_DIRECTORY) ) {
			/* O_CREAT and O_DIRECTORY are mutually exclusive */
			error = EINVAL;
			goto out;
		}
		ndp->ni_cnd.cn_nameiop = CREATE;
#if CONFIG_TRIGGERS
		ndp->ni_op = OP_LINK;
#endif
		/* Inherit USEDVP, vnode_open() supported flags only */
		ndp->ni_cnd.cn_flags &= (USEDVP | NOCROSSMOUNT | DOWHITEOUT);
		ndp->ni_cnd.cn_flags |= LOCKPARENT | LOCKLEAF | AUDITVNPATH1;
		ndp->ni_flag = NAMEI_COMPOUNDOPEN;
#if NAMEDRSRCFORK
		/* open calls are allowed for resource forks. */
		ndp->ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
#endif
		/*
		 * Follow a trailing symlink only for non-exclusive creates,
		 * when O_NOFOLLOW wasn't given and the caller originally
		 * asked for FOLLOW semantics.
		 */
		if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0 && (origcnflags & FOLLOW) != 0)
			ndp->ni_cnd.cn_flags |= FOLLOW;

continue_create_lookup:
		if ( (error = namei(ndp)) )
			goto out;

		dvp = ndp->ni_dvp;
		vp = ndp->ni_vp;

		batched = vnode_compound_open_available(dvp);

		/* not found, create */
		if (vp == NULL) {
			/* must have attributes for a new file */
			if (vap == NULL) {
				error = EINVAL;
				goto out;
			}
			/*
			 * Attempt a create.   For a system supporting compound VNOPs, we may
			 * find an existing file or create one; in either case, we will already
			 * have the file open and no VNOP_OPEN() will be needed.
			 */
			error = vn_open_auth_do_create(ndp, vap, fmode, &did_create, &did_open, ctx);

			/* ni_dvp/ni_vp may have been changed by the create; reload */
			dvp = ndp->ni_dvp;
			vp = ndp->ni_vp;

			/*
			 * Detected a node that the filesystem couldn't handle.  Don't call
			 * nameidone() yet, because we need that path buffer.
			 */
			if (error == EKEEPLOOKING) {
				if (!batched) {
					panic("EKEEPLOOKING from a filesystem that doesn't support compound VNOPs?");
				}
				goto continue_create_lookup;
			}

			nameidone(ndp);
			if (dvp) {
				/* vn_open_auth_do_create() must have dropped dvp already */
				panic("Shouldn't have a dvp here.");
			}

			if (error) {
				/*
				 * Check for a creation or unlink race.
				 */
				if (((error == EEXIST) && !(fmode & O_EXCL)) ||
						((error == ENOENT) && (fmode & O_CREAT))){
					/* lost the race: retry the whole open from scratch */
					if (vp)
						vnode_put(vp);
					goto again;
				}
				goto bad;
			}

			/* A batched create may have opened the file already. */
			need_vnop_open = !did_open;
		}
		else {
			if (fmode & O_EXCL)
				error = EEXIST;

			/*
			 * We have a vnode.  Use compound open if available
			 * or else fall through to "traditional" path.  Note: can't
			 * do a compound open for root, because the parent belongs
			 * to a different FS.
			 */
			if (error == 0 && batched && (vnode_mount(dvp) == vnode_mount(vp))) {
				error = VNOP_COMPOUND_OPEN(dvp, &ndp->ni_vp, ndp, 0, fmode, NULL, NULL, ctx);

				if (error == 0) {
					vp = ndp->ni_vp;
					need_vnop_open = FALSE;
				} else if (error == EKEEPLOOKING) {
					if ((ndp->ni_flag & NAMEI_CONTLOOKUP) == 0) {
						panic("EKEEPLOOKING, but continue flag not set?");
					}
					goto continue_create_lookup;
				}
			}
			nameidone(ndp);
			vnode_put(dvp);
			ndp->ni_dvp = NULLVP;

			if (error) {
				goto bad;
			}

			/* File existed: report "opened", not "created". */
			fmode &= ~O_CREAT;

			/* Fall through */
		}
	} else {
		/*
		 * Not O_CREAT
		 */
		ndp->ni_cnd.cn_nameiop = LOOKUP;
		/* Inherit USEDVP, vnode_open() supported flags only */
		ndp->ni_cnd.cn_flags &= (USEDVP | NOCROSSMOUNT | DOWHITEOUT);
		ndp->ni_cnd.cn_flags |= FOLLOW | LOCKLEAF | AUDITVNPATH1 | WANTPARENT;
#if NAMEDRSRCFORK
		/* open calls are allowed for resource forks. */
		ndp->ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
#endif
		ndp->ni_flag = NAMEI_COMPOUNDOPEN;

		/* preserve NOFOLLOW from vnode_open() */
		if (fmode & O_NOFOLLOW || fmode & O_SYMLINK || (origcnflags & FOLLOW) == 0) {
			ndp->ni_cnd.cn_flags &= ~FOLLOW;
		}

		/* Do a lookup, possibly going directly to filesystem for compound operation */
		do {
			if ( (error = namei(ndp)) )
				goto out;
			vp = ndp->ni_vp;
			dvp = ndp->ni_dvp;

			/* Check for batched lookup-open */
			batched = vnode_compound_open_available(dvp);
			if (batched && ((vp == NULLVP) || (vnode_mount(dvp) == vnode_mount(vp)))) {
				error = VNOP_COMPOUND_OPEN(dvp, &ndp->ni_vp, ndp, 0, fmode, NULL, NULL, ctx);
				vp = ndp->ni_vp;
				if (error == 0) {
					need_vnop_open = FALSE;
				} else if (error == EKEEPLOOKING) {
					if ((ndp->ni_flag & NAMEI_CONTLOOKUP) == 0) {
						panic("EKEEPLOOKING, but continue flag not set?");
					}
				}
			}
		} while (error == EKEEPLOOKING);

		nameidone(ndp);
		vnode_put(dvp);
		ndp->ni_dvp = NULLVP;

		if (error) {
			goto bad;
		}
	}

	/*
	 * By this point, nameidone() is called, dvp iocount is dropped,
	 * and dvp pointer is cleared.
	 */
	if (ndp->ni_dvp != NULLVP) {
		panic("Haven't cleaned up adequately in vn_open_auth()");
	}

	/*
	 * Expect to use this code for filesystems without compound VNOPs, for the root
	 * of a filesystem, which can't be "looked up" in the sense of VNOP_LOOKUP(),
	 * and for shadow files, which do not live on the same filesystems as their "parents."
	 */
	if (need_vnop_open) {
		if (batched && !vnode_isvroot(vp) && !vnode_isnamedstream(vp)) {
			panic("Why am I trying to use VNOP_OPEN() on anything other than the root or a named stream?");
		}

		if (!did_create) {
			/* freshly-created nodes were authorized in vn_open_auth_do_create() */
			error = vn_authorize_open_existing(vp, &ndp->ni_cnd, fmode, ctx, NULL);
			if (error) {
				goto bad;
			}
		}

#if CONFIG_PROTECT
		/*
		 * Perform any content protection access checks prior to calling
		 * into the filesystem, if the raw encrypted mode was not
		 * requested.
		 *
		 * If the va_dataprotect_flags are NOT active, or if they are,
		 * but they do not have the VA_DP_RAWENCRYPTED bit set, then we need
		 * to perform the checks.
		 */
		if (!(VATTR_IS_ACTIVE (vap, va_dataprotect_flags)) ||
		    ((vap->va_dataprotect_flags & VA_DP_RAWENCRYPTED) == 0)) {
			error = cp_handle_open (vp, fmode);
			if (error) {
				goto bad;
			}
		}
#endif

		error = VNOP_OPEN(vp, fmode, ctx);
		if (error) {
			goto bad;
		}
		need_vnop_open = FALSE;
	}

	// if the vnode is tagged VOPENEVT and the current process
	// has the P_CHECKOPENEVT flag set, then we or in the O_EVTONLY
	// flag to the open mode so that this open won't count against
	// the vnode when carbon delete() does a vnode_isinuse() to see
	// if a file is currently in use.  this allows spotlight
	// importers to not interfere with carbon apps that depend on
	// the no-delete-if-busy semantics of carbon delete().
	//
	if (!did_create && (vp->v_flag & VOPENEVT) && (current_proc()->p_flag & P_CHECKOPENEVT)) {
		fmode |= O_EVTONLY;
	}

	/*
	 * Grab reference, etc.
	 */
	error = vn_open_auth_finish(vp, fmode, ctx);
	if (error) {
		ref_failed = TRUE;
		goto bad;
	}

	/* Compound VNOP open is responsible for doing the truncate */
	if (batched || did_create)
		fmode &= ~O_TRUNC;

	*fmodep = fmode;
	return (0);

bad:
	/* Opened either explicitly or by a batched create */
	if (!need_vnop_open) {
		VNOP_CLOSE(vp, fmode, ctx);
	}

	ndp->ni_vp = NULL;
	if (vp) {
#if NAMEDRSRCFORK
		/* Aggressively recycle shadow files if we error'd out during open() */
		if ((vnode_isnamedstream(vp)) &&
			(vp->v_parent != NULLVP) &&
			(vnode_isshadow(vp))) {
			vnode_recycle(vp);
		}
#endif
		vnode_put(vp);
		/*
		 * Check for a race against unlink.  We had a vnode
		 * but according to vnode_authorize or VNOP_OPEN it
		 * no longer exists.
		 *
		 * EREDRIVEOPEN: means that we were hit by the tty allocation race.
		 */
		if (((error == ENOENT) && (*fmodep & O_CREAT)) || (error == EREDRIVEOPEN) || ref_failed) {
			goto again;
		}
	}

out:
	return (error);
}
649
#if vn_access_DEPRECATED
/*
 * Authorize an action against a vnode.  This has been the canonical way to
 * ensure that the credential/process/etc. referenced by a vfs_context
 * is granted the rights called out in 'mode' against the vnode 'vp'.
 *
 * Unfortunately, the use of VREAD/VWRITE/VEXEC makes it very difficult
 * to add support for more rights.  As such, this interface will be deprecated
 * and callers will use vnode_authorize instead.
 */
int
vn_access(vnode_t vp, int mode, vfs_context_t context)
{
	kauth_action_t action = 0;

	/* Translate the legacy V* bits into kauth vnode rights. */
	action |= (mode & VREAD)  ? KAUTH_VNODE_READ_DATA  : 0;
	action |= (mode & VWRITE) ? KAUTH_VNODE_WRITE_DATA : 0;
	action |= (mode & VEXEC)  ? KAUTH_VNODE_EXECUTE    : 0;

	return vnode_authorize(vp, NULL, action, context);
}
#endif	/* vn_access_DEPRECATED */
676
/*
 * Vnode close call
 *
 * Releases the open reference (vnode_rele_ext) and issues VNOP_CLOSE().
 * For special (device) vnodes the reference is dropped BEFORE the close
 * ("foxhound" workaround); for all other vnodes it is dropped after.
 * If the file was written (FWASWRITTEN), named-stream shadow data is
 * flushed first and an FSE_CONTENT_MODIFIED fsevent is posted.
 */
int
vn_close(struct vnode *vp, int flags, vfs_context_t ctx)
{
	int error;

#if NAMEDRSRCFORK
	/* Sync data from resource fork shadow file if needed. */
	if ((vp->v_flag & VISNAMEDSTREAM) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp)) {
		if (flags & FWASWRITTEN) {
			(void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
		}
	}
#endif

	/* work around for foxhound */
	if (vnode_isspec(vp))
		(void)vnode_rele_ext(vp, flags, 0);

	error = VNOP_CLOSE(vp, flags, ctx);

#if CONFIG_FSE
	if (flags & FWASWRITTEN) {
		if (need_fsevent(FSE_CONTENT_MODIFIED, vp)) {
			add_fsevent(FSE_CONTENT_MODIFIED, ctx,
				FSE_ARG_VNODE, vp,
				FSE_ARG_DONE);
		}
	}
#endif

	/* Non-spec vnodes drop their open reference after VNOP_CLOSE(). */
	if (!vnode_isspec(vp))
		(void)vnode_rele_ext(vp, flags, 0);

	return (error);
}
717
718 static int
719 vn_read_swapfile(
720 struct vnode *vp,
721 uio_t uio)
722 {
723 int error;
724 off_t swap_count, this_count;
725 off_t file_end, read_end;
726 off_t prev_resid;
727 char *my_swap_page;
728
729 /*
730 * Reading from a swap file will get you zeroes.
731 */
732
733 my_swap_page = NULL;
734 error = 0;
735 swap_count = uio_resid(uio);
736
737 file_end = ubc_getsize(vp);
738 read_end = uio->uio_offset + uio_resid(uio);
739 if (uio->uio_offset >= file_end) {
740 /* uio starts after end of file: nothing to read */
741 swap_count = 0;
742 } else if (read_end > file_end) {
743 /* uio extends beyond end of file: stop before that */
744 swap_count -= (read_end - file_end);
745 }
746
747 while (swap_count > 0) {
748 if (my_swap_page == NULL) {
749 MALLOC(my_swap_page, char *, PAGE_SIZE,
750 M_TEMP, M_WAITOK);
751 memset(my_swap_page, '\0', PAGE_SIZE);
752 /* add an end-of-line to keep line counters happy */
753 my_swap_page[PAGE_SIZE-1] = '\n';
754 }
755 this_count = swap_count;
756 if (this_count > PAGE_SIZE) {
757 this_count = PAGE_SIZE;
758 }
759
760 prev_resid = uio_resid(uio);
761 error = uiomove((caddr_t) my_swap_page,
762 this_count,
763 uio);
764 if (error) {
765 break;
766 }
767 swap_count -= (prev_resid - uio_resid(uio));
768 }
769 if (my_swap_page != NULL) {
770 FREE(my_swap_page, M_TEMP);
771 my_swap_page = NULL;
772 }
773
774 return error;
775 }
776 /*
777 * Package up an I/O request on a vnode into a uio and do it.
778 */
779 int
780 vn_rdwr(
781 enum uio_rw rw,
782 struct vnode *vp,
783 caddr_t base,
784 int len,
785 off_t offset,
786 enum uio_seg segflg,
787 int ioflg,
788 kauth_cred_t cred,
789 int *aresid,
790 proc_t p)
791 {
792 int64_t resid;
793 int result;
794
795 result = vn_rdwr_64(rw,
796 vp,
797 (uint64_t)(uintptr_t)base,
798 (int64_t)len,
799 offset,
800 segflg,
801 ioflg,
802 cred,
803 &resid,
804 p);
805
806 /* "resid" should be bounded above by "len," which is an int */
807 if (aresid != NULL) {
808 *aresid = resid;
809 }
810
811 return result;
812 }
813
814
/*
 * 64-bit-clean core of vn_rdwr(): package an I/O request on a vnode into
 * a single-iovec uio and issue it under an ad-hoc vfs_context built from
 * the current thread and the supplied credential.
 *
 * Parameters:	rw	UIO_READ or UIO_WRITE
 *		vp	vnode to perform I/O against
 *		base	buffer address (user or kernel, per segflg)
 *		len	number of bytes to transfer
 *		offset	starting file offset
 *		segflg	address space of 'base'
 *		ioflg	IO_* flags passed to the VNOP; IO_NOAUTH skips the
 *			MACF check
 *		cred	credential for the operation
 *		aresid	if non-NULL, receives the residual byte count;
 *			if NULL, a short transfer with no error becomes EIO
 *		p	process, used to select 32- vs 64-bit user uio layout
 *
 * Returns:	0	Success
 *		!0	errno value
 */
int
vn_rdwr_64(
	enum uio_rw rw,
	struct vnode *vp,
	uint64_t base,
	int64_t len,
	off_t offset,
	enum uio_seg segflg,
	int ioflg,
	kauth_cred_t cred,
	int64_t *aresid,
	proc_t p)
{
	uio_t auio;
	int spacetype;
	struct vfs_context context;
	int error=0;
	char uio_buf[ UIO_SIZEOF(1) ];	/* stack storage for the one-iovec uio */

	context.vc_thread = current_thread();
	context.vc_ucred = cred;

	if (UIO_SEG_IS_USER_SPACE(segflg)) {
		/* user addresses need the layout matching the process's ABI */
		spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	}
	else {
		spacetype = UIO_SYSSPACE;
	}
	auio = uio_createwithbuffer(1, offset, spacetype, rw,
				  &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, base, len);

#if CONFIG_MACF
	/* XXXMAC
	 * 	IO_NOAUTH should be re-examined.
 	 *	Likely that mediation should be performed in caller.
	 */
	if ((ioflg & IO_NOAUTH) == 0) {
		/* passed cred is fp->f_cred */
		if (rw == UIO_READ)
			error = mac_vnode_check_read(&context, cred, vp);
		else
			error = mac_vnode_check_write(&context, cred, vp);
	}
#endif

	if (error == 0) {
		if (rw == UIO_READ) {
			if (vnode_isswap(vp)) {
				/* swap files must read back as zeroes */
				error = vn_read_swapfile(vp, auio);
			} else {
				error = VNOP_READ(vp, auio, ioflg, &context);
			}
		} else {
			error = VNOP_WRITE(vp, auio, ioflg, &context);
		}
	}

	if (aresid)
		*aresid = uio_resid(auio);
	else
		if (uio_resid(auio) && error == 0)
			error = EIO;	/* short transfer with nowhere to report it */
	return (error);
}
880
/*
 * File table vnode read routine.
 *
 * Translates fileglob flags (FNONBLOCK/FNOCACHE/FENCRYPTED/FNORDAHEAD)
 * into IO_* flags, performs the MACF read check, and dispatches to
 * VNOP_READ() — or vn_read_swapfile() for swap files, which must read
 * back zeroes.  Unless FOF_OFFSET is set, the shared file offset in the
 * fileglob is used as the starting position and advanced by the number
 * of bytes actually transferred.
 */
static int
vn_read(struct fileproc *fp, struct uio *uio, int flags, vfs_context_t ctx)
{
	struct vnode *vp;
	int error, ioflag;
	off_t count;

	vp = (struct vnode *)fp->f_fglob->fg_data;
	/* hold the vnode across the I/O; fail if it can't be obtained */
	if ( (error = vnode_getwithref(vp)) ) {
		return(error);
	}

#if CONFIG_MACF
	error = mac_vnode_check_read(ctx, vfs_context_ucred(ctx), vp);
	if (error) {
		(void)vnode_put(vp);
		return (error);
	}
#endif

	/* This signals to VNOP handlers that this read came from a file table read */
	ioflag = IO_SYSCALL_DISPATCH;

	if (fp->f_fglob->fg_flag & FNONBLOCK)
		ioflag |= IO_NDELAY;
	if ((fp->f_fglob->fg_flag & FNOCACHE) || vnode_isnocache(vp))
		ioflag |= IO_NOCACHE;
	if (fp->f_fglob->fg_flag & FENCRYPTED) {
		ioflag |= IO_ENCRYPTED;
	}
	if (fp->f_fglob->fg_flag & FNORDAHEAD)
		ioflag |= IO_RAOFF;

	if ((flags & FOF_OFFSET) == 0)
		/* regular read(2): start at the shared file offset */
		uio->uio_offset = fp->f_fglob->fg_offset;
	count = uio_resid(uio);

	if (vnode_isswap(vp)) {
		/* special case for swap files */
		error = vn_read_swapfile(vp, uio);
	} else {
		error = VNOP_READ(vp, uio, ioflag, ctx);
	}
	if ((flags & FOF_OFFSET) == 0)
		/* advance the shared offset by the bytes actually read */
		fp->f_fglob->fg_offset += count - uio_resid(uio);

	(void)vnode_put(vp);
	return (error);
}
933
934
/*
 * File table vnode write routine.
 *
 * Translates fileglob flags into IO_* flags, performs the MACF write
 * check, enforces RLIMIT_FSIZE — clipping the request (and restoring the
 * clipped residual afterward) or failing with SIGXFSZ/EFBIG when nothing
 * can be written — then issues VNOP_WRITE().  Unless FOF_OFFSET is set,
 * the shared fileglob offset is used and advanced; for O_APPEND the
 * resulting uio offset (chosen by the filesystem) is adopted.  On NFS
 * vnodes with UBC info, the writing credential is cached on the vnode
 * via ubc_setthreadcred()/ubc_setcred().
 */
static int
vn_write(struct fileproc *fp, struct uio *uio, int flags, vfs_context_t ctx)
{
	struct vnode *vp;
	int error, ioflag;
	off_t count;
	int clippedsize = 0;	/* bytes removed from the request by RLIMIT_FSIZE */
	int partialwrite=0;	/* set when the request was clipped */
	int residcount, oldcount;
	proc_t p = vfs_context_proc(ctx);

	count = 0;
	vp = (struct vnode *)fp->f_fglob->fg_data;
	/* hold the vnode across the I/O; fail if it can't be obtained */
	if ( (error = vnode_getwithref(vp)) ) {
		return(error);
	}

#if CONFIG_MACF
	error = mac_vnode_check_write(ctx, vfs_context_ucred(ctx), vp);
	if (error) {
		(void)vnode_put(vp);
		return (error);
	}
#endif

	/*
	 * IO_SYSCALL_DISPATCH signals to VNOP handlers that this write originated
	 * from a file table write.
	 */
	ioflag = (IO_UNIT | IO_SYSCALL_DISPATCH);

	if (vp->v_type == VREG && (fp->f_fglob->fg_flag & O_APPEND))
		ioflag |= IO_APPEND;
	if (fp->f_fglob->fg_flag & FNONBLOCK)
		ioflag |= IO_NDELAY;
	if ((fp->f_fglob->fg_flag & FNOCACHE) || vnode_isnocache(vp))
	        ioflag |= IO_NOCACHE;
	if (fp->f_fglob->fg_flag & FNODIRECT)
		ioflag |= IO_NODIRECT;
	if (fp->f_fglob->fg_flag & FSINGLE_WRITER)
		ioflag |= IO_SINGLE_WRITER;

	/*
	 * Treat synchronous mounts and O_FSYNC on the fd as equivalent.
	 *
	 * XXX We treat O_DSYNC as O_FSYNC for now, since we can not delay
	 * XXX the non-essential metadata without some additional VFS work;
	 * XXX the intent at this point is to plumb the interface for it.
	 */
	if ((fp->f_fglob->fg_flag & (O_FSYNC|O_DSYNC)) ||
		(vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS))) {
		ioflag |= IO_SYNC;
	}

	if ((flags & FOF_OFFSET) == 0) {
		/* regular write(2): start at the shared file offset */
		uio->uio_offset = fp->f_fglob->fg_offset;
		count = uio_resid(uio);
	}
	if (((flags & FOF_OFFSET) == 0) &&
	 	vfs_context_proc(ctx) && (vp->v_type == VREG) &&
            (((rlim_t)(uio->uio_offset + uio_resid(uio)) > p->p_rlimit[RLIMIT_FSIZE].rlim_cur) ||
	     ((rlim_t)uio_resid(uio) > (p->p_rlimit[RLIMIT_FSIZE].rlim_cur - uio->uio_offset)))) {
	     	/*
		 * If the requested residual would cause us to go past the
		 * administrative limit, then we need to adjust the residual
		 * down to cause fewer bytes than requested to be written.  If
		 * we can't do that (e.g. the residual is already 1 byte),
		 * then we fail the write with EFBIG.
		 */
		residcount = uio_resid(uio);
		if ((rlim_t)(uio->uio_offset + uio_resid(uio)) > p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
			clippedsize = (uio->uio_offset + uio_resid(uio)) - p->p_rlimit[RLIMIT_FSIZE].rlim_cur;
		} else if ((rlim_t)uio_resid(uio) > (p->p_rlimit[RLIMIT_FSIZE].rlim_cur - uio->uio_offset)) {
			clippedsize = (p->p_rlimit[RLIMIT_FSIZE].rlim_cur - uio->uio_offset);
		}
		if (clippedsize >= residcount) {
			/* not even one byte can be written within the limit */
			psignal(p, SIGXFSZ);
			vnode_put(vp);
			return (EFBIG);
		}
		partialwrite = 1;
		uio_setresid(uio, residcount-clippedsize);
	}
	if ((flags & FOF_OFFSET) != 0) {
		/* for pwrite, append should be ignored */
		ioflag &= ~IO_APPEND;
		if (p && (vp->v_type == VREG) &&
			((rlim_t)uio->uio_offset >= p->p_rlimit[RLIMIT_FSIZE].rlim_cur)) {
			/* starting offset is already at/over the limit */
			psignal(p, SIGXFSZ);
			vnode_put(vp);
			return (EFBIG);
		}
		if (p && (vp->v_type == VREG) &&
			((rlim_t)(uio->uio_offset + uio_resid(uio)) > p->p_rlimit[RLIMIT_FSIZE].rlim_cur)) {
			//Debugger("vn_bwrite:overstepping the bounds");
			residcount = uio_resid(uio);
			clippedsize =  (uio->uio_offset + uio_resid(uio)) - p->p_rlimit[RLIMIT_FSIZE].rlim_cur;
			partialwrite = 1;
			uio_setresid(uio, residcount-clippedsize);
		}
	}

	error = VNOP_WRITE(vp, uio, ioflag, ctx);

	if (partialwrite) {
		/* restore the clipped bytes so the caller sees them as unwritten */
		oldcount = uio_resid(uio);
		uio_setresid(uio, oldcount + clippedsize);
	}

	if ((flags & FOF_OFFSET) == 0) {
		if (ioflag & IO_APPEND)
			/* append: adopt the offset the filesystem chose */
			fp->f_fglob->fg_offset = uio->uio_offset;
		else
			fp->f_fglob->fg_offset += count - uio_resid(uio);
	}

	/*
	 * Set the credentials on successful writes
	 */
	if ((error == 0) && (vp->v_tag == VT_NFS) && (UBCINFOEXISTS(vp))) {
		/*
		 * When called from aio subsystem, we only have the proc from
		 * which to get the credential, at this point, so use that
		 * instead.  This means aio functions are incompatible with
		 * per-thread credentials (aio operations are proxied).  We
		 * can't easily correct the aio vs. settid race in this case
		 * anyway, so we disallow it.
		 */
		if ((flags & FOF_PCRED) == 0) {
			ubc_setthreadcred(vp, p, current_thread());
		} else {
			ubc_setcred(vp, p);
		}
	}
	(void)vnode_put(vp);
	return (error);
}
1075
1076 /*
1077 * File table vnode stat routine.
1078 *
1079 * Returns: 0 Success
1080 * EBADF
1081 * ENOMEM
1082 * vnode_getattr:???
1083 */
int
vn_stat_noauth(struct vnode *vp, void *sbptr, kauth_filesec_t *xsec, int isstat64, vfs_context_t ctx)
{
	struct vnode_attr va;
	int error;
	u_short mode;
	kauth_filesec_t fsec;
	struct stat *sb = (struct stat *)0;	/* warning avoidance ; protected by isstat64 */
	struct stat64 * sb64 = (struct stat64 *)0;  /* warning avoidance ; protected by isstat64 */

	/* sbptr points at either a struct stat or a struct stat64; isstat64 selects which */
	if (isstat64 != 0)
		sb64 = (struct stat64 *)sbptr;
	else
		sb = (struct stat *)sbptr;
	memset(&va, 0, sizeof(va));
	VATTR_INIT(&va);
	/* request every attribute needed to populate the stat buffer */
	VATTR_WANTED(&va, va_fsid);
	VATTR_WANTED(&va, va_fileid);
	VATTR_WANTED(&va, va_mode);
	VATTR_WANTED(&va, va_type);
	VATTR_WANTED(&va, va_nlink);
	VATTR_WANTED(&va, va_uid);
	VATTR_WANTED(&va, va_gid);
	VATTR_WANTED(&va, va_rdev);
	VATTR_WANTED(&va, va_data_size);
	VATTR_WANTED(&va, va_access_time);
	VATTR_WANTED(&va, va_modify_time);
	VATTR_WANTED(&va, va_change_time);
	VATTR_WANTED(&va, va_create_time);
	VATTR_WANTED(&va, va_flags);
	VATTR_WANTED(&va, va_gen);
	VATTR_WANTED(&va, va_iosize);
	/* lower layers will synthesise va_total_alloc from va_data_size if required */
	VATTR_WANTED(&va, va_total_alloc);
	if (xsec != NULL) {
		/* caller also wants extended security data (ACL, owner/group GUIDs) */
		VATTR_WANTED(&va, va_uuuid);
		VATTR_WANTED(&va, va_guuid);
		VATTR_WANTED(&va, va_acl);
	}
	error = vnode_getattr(vp, &va, ctx);
	if (error)
		goto out;
	/*
	 * Copy from vattr table
	 */
	if (isstat64 != 0) {
		sb64->st_dev = va.va_fsid;
		sb64->st_ino = (ino64_t)va.va_fileid;

	} else {
		sb->st_dev = va.va_fsid;
		sb->st_ino = (ino_t)va.va_fileid;
	}
	mode = va.va_mode;
	/* fold the vnode type into the S_IF* bits of the file mode */
	switch (vp->v_type) {
	case VREG:
		mode |= S_IFREG;
		break;
	case VDIR:
		mode |= S_IFDIR;
		break;
	case VBLK:
		mode |= S_IFBLK;
		break;
	case VCHR:
		mode |= S_IFCHR;
		break;
	case VLNK:
		mode |= S_IFLNK;
		break;
	case VSOCK:
		mode |= S_IFSOCK;
		break;
	case VFIFO:
		mode |= S_IFIFO;
		break;
	default:
		/* any other vnode type cannot be represented in a stat buffer */
		error = EBADF;
		goto out;
	};
	if (isstat64 != 0) {
		sb64->st_mode = mode;
		/* synthesise a link count of 1 if the FS did not supply one */
		sb64->st_nlink = VATTR_IS_SUPPORTED(&va, va_nlink) ? (u_int16_t)va.va_nlink : 1;
		sb64->st_uid = va.va_uid;
		sb64->st_gid = va.va_gid;
		sb64->st_rdev = va.va_rdev;
		sb64->st_size = va.va_data_size;
		sb64->st_atimespec = va.va_access_time;
		sb64->st_mtimespec = va.va_modify_time;
		sb64->st_ctimespec = va.va_change_time;
		/* fall back to the change time if the FS has no creation time */
		sb64->st_birthtimespec =
				VATTR_IS_SUPPORTED(&va, va_create_time) ? va.va_create_time : va.va_change_time;
		sb64->st_blksize = va.va_iosize;
		sb64->st_flags = va.va_flags;
		/* st_blocks is expressed in 512-byte units */
		sb64->st_blocks = roundup(va.va_total_alloc, 512) / 512;
	} else {
		sb->st_mode = mode;
		sb->st_nlink = VATTR_IS_SUPPORTED(&va, va_nlink) ? (u_int16_t)va.va_nlink : 1;
		sb->st_uid = va.va_uid;
		sb->st_gid = va.va_gid;
		sb->st_rdev = va.va_rdev;
		sb->st_size = va.va_data_size;
		sb->st_atimespec = va.va_access_time;
		sb->st_mtimespec = va.va_modify_time;
		sb->st_ctimespec = va.va_change_time;
		sb->st_blksize = va.va_iosize;
		sb->st_flags = va.va_flags;
		sb->st_blocks = roundup(va.va_total_alloc, 512) / 512;
	}

	/* if we're interested in extended security data and we got an ACL */
	if (xsec != NULL) {
		if (!VATTR_IS_SUPPORTED(&va, va_acl) &&
		    !VATTR_IS_SUPPORTED(&va, va_uuuid) &&
		    !VATTR_IS_SUPPORTED(&va, va_guuid)) {
			/* FS supports none of the extended security attributes */
			*xsec = KAUTH_FILESEC_NONE;
		} else {

			/* size the filesec by the ACL entry count (0 entries if no ACL) */
			if (VATTR_IS_SUPPORTED(&va, va_acl) && (va.va_acl != NULL)) {
				fsec = kauth_filesec_alloc(va.va_acl->acl_entrycount);
			} else {
				fsec = kauth_filesec_alloc(0);
			}
			if (fsec == NULL) {
				error = ENOMEM;
				goto out;
			}
			fsec->fsec_magic = KAUTH_FILESEC_MAGIC;
			/* use the null GUID for owner/group the FS could not supply */
			if (VATTR_IS_SUPPORTED(&va, va_uuuid)) {
				fsec->fsec_owner = va.va_uuuid;
			} else {
				fsec->fsec_owner = kauth_null_guid;
			}
			if (VATTR_IS_SUPPORTED(&va, va_guuid)) {
				fsec->fsec_group = va.va_guuid;
			} else {
				fsec->fsec_group = kauth_null_guid;
			}
			if (VATTR_IS_SUPPORTED(&va, va_acl) && (va.va_acl != NULL)) {
				bcopy(va.va_acl, &(fsec->fsec_acl), KAUTH_ACL_COPYSIZE(va.va_acl));
			} else {
				fsec->fsec_acl.acl_entrycount = KAUTH_FILESEC_NOACL;
			}
			/* ownership of the filesec transfers to the caller */
			*xsec = fsec;
		}
	}

	/* Do not give the generation number out to unprivileged users */
	if (va.va_gen && !vfs_context_issuser(ctx)) {
		if (isstat64 != 0)
			sb64->st_gen = 0;
		else
			sb->st_gen = 0;
	} else {
		if (isstat64 != 0)
			sb64->st_gen = va.va_gen;
		else
			sb->st_gen = va.va_gen;
	}

	error = 0;
out:
	/* release the ACL the FS allocated for us in vnode_getattr(), if any */
	if (VATTR_IS_SUPPORTED(&va, va_acl) && va.va_acl != NULL)
		kauth_acl_free(va.va_acl);
	return (error);
}
1250
1251 int
1252 vn_stat(struct vnode *vp, void *sb, kauth_filesec_t *xsec, int isstat64, vfs_context_t ctx)
1253 {
1254 int error;
1255
1256 #if CONFIG_MACF
1257 error = mac_vnode_check_stat(ctx, NOCRED, vp);
1258 if (error)
1259 return (error);
1260 #endif
1261
1262 /* authorize */
1263 if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_ATTRIBUTES | KAUTH_VNODE_READ_SECURITY, ctx)) != 0)
1264 return(error);
1265
1266 /* actual stat */
1267 return(vn_stat_noauth(vp, sb, xsec, isstat64, ctx));
1268 }
1269
1270
1271 /*
1272 * File table vnode ioctl routine.
1273 */
static int
vn_ioctl(struct fileproc *fp, u_long com, caddr_t data, vfs_context_t ctx)
{
	struct vnode *vp = ((struct vnode *)fp->f_fglob->fg_data);
	off_t file_size;
	int error;
	struct vnode *ttyvp;
	int funnel_state;
	struct session * sessp;

	/* take an iocount; fails if the vnode is being reclaimed */
	if ( (error = vnode_getwithref(vp)) ) {
		return(error);
	}

#if CONFIG_MACF
	error = mac_vnode_check_ioctl(ctx, vp, com);
	if (error)
		goto out;
#endif

	switch (vp->v_type) {
	case VREG:
	case VDIR:
		if (com == FIONREAD) {
			/* report bytes between the current file offset and EOF */
			if ((error = vnode_size(vp, &file_size, ctx)) != 0)
				goto out;
			*(int *)data = file_size - fp->f_fglob->fg_offset;
			goto out;
		}
		if (com == FIONBIO || com == FIOASYNC) {	/* XXX */
			goto out;
		}
		/* fall into ... */

	default:
		error = ENOTTY;
		goto out;

	case VFIFO:
	case VCHR:
	case VBLK:

		/* Should not be able to set block size from user space */
		if (com == DKIOCSETBLOCKSIZE) {
			error = EPERM;
			goto out;
		}

		if (com == FIODTYPE) {
			/* return the device type bits from the block/char device switch */
			if (vp->v_type == VBLK) {
				if (major(vp->v_rdev) >= nblkdev) {
					error = ENXIO;
					goto out;
				}
				*(int *)data = D_TYPEMASK & bdevsw[major(vp->v_rdev)].d_type;

			} else if (vp->v_type == VCHR) {
				if (major(vp->v_rdev) >= nchrdev) {
					error = ENXIO;
					goto out;
				}
				*(int *)data = D_TYPEMASK & cdevsw[major(vp->v_rdev)].d_type;
			} else {
				error = ENOTTY;
				goto out;
			}
			goto out;
		}
		error = VNOP_IOCTL(vp, com, data, fp->f_fglob->fg_flag, ctx);

		if (error == 0 && com == TIOCSCTTY) {
			/*
			 * This vnode is becoming the session's controlling
			 * terminal: take a forced usecount reference that
			 * will be held via sessp->s_ttyvp.
			 */
			error = vnode_ref_ext(vp, 0, VNODE_REF_FORCE);
			if (error != 0) {
				panic("vnode_ref_ext() failed despite VNODE_REF_FORCE?!");
			}

			funnel_state = thread_funnel_set(kernel_flock, TRUE);
			sessp = proc_session(vfs_context_proc(ctx));

			/* swap the session's controlling tty under the session lock */
			session_lock(sessp);
			ttyvp = sessp->s_ttyvp;
			sessp->s_ttyvp = vp;
			sessp->s_ttyvid = vnode_vid(vp);
			session_unlock(sessp);
			session_rele(sessp);
			thread_funnel_set(kernel_flock, funnel_state);

			/* drop the reference held on the previous controlling tty, if any */
			if (ttyvp)
				vnode_rele(ttyvp);
		}
	}
out:
	(void)vnode_put(vp);
	return(error);
}
1369
1370 /*
1371 * File table vnode select routine.
1372 */
static int
vn_select(struct fileproc *fp, int which, void *wql, __unused vfs_context_t ctx)
{
	int error;
	struct vnode * vp = (struct vnode *)fp->f_fglob->fg_data;
	struct vfs_context context;

	if ( (error = vnode_getwithref(vp)) == 0 ) {
		/*
		 * NOTE(review): 'context' is initialized here but never used
		 * in this function; VNOP_SELECT is passed the caller's ctx.
		 */
		context.vc_thread = current_thread();
		context.vc_ucred = fp->f_fglob->fg_cred;

#if CONFIG_MACF
		/*
		 * XXX We should use a per thread credential here; minimally,
		 * XXX the process credential should have a persistent
		 * XXX reference on it before being passed in here.
		 */
		error = mac_vnode_check_select(ctx, vp, which);
		if (error == 0)
#endif
		error = VNOP_SELECT(vp, which, fp->f_fglob->fg_flag, wql, ctx);

		(void)vnode_put(vp);
	}
	return(error);

}
1400
1401 /*
1402 * File table vnode close routine.
1403 */
1404 static int
1405 vn_closefile(struct fileglob *fg, vfs_context_t ctx)
1406 {
1407 struct vnode *vp = (struct vnode *)fg->fg_data;
1408 int error;
1409 struct flock lf;
1410
1411 if ( (error = vnode_getwithref(vp)) == 0 ) {
1412
1413 if ((fg->fg_flag & FHASLOCK) && fg->fg_type == DTYPE_VNODE) {
1414 lf.l_whence = SEEK_SET;
1415 lf.l_start = 0;
1416 lf.l_len = 0;
1417 lf.l_type = F_UNLCK;
1418
1419 (void)VNOP_ADVLOCK(vp, (caddr_t)fg, F_UNLCK, &lf, F_FLOCK, ctx);
1420 }
1421 error = vn_close(vp, fg->fg_flag, ctx);
1422
1423 (void)vnode_put(vp);
1424 }
1425 return(error);
1426 }
1427
1428 /*
1429 * Returns: 0 Success
1430 * VNOP_PATHCONF:???
1431 */
int
vn_pathconf(vnode_t vp, int name, int32_t *retval, vfs_context_t ctx)
{
	int error = 0;
	struct vfs_attr vfa;

	/* answer the variables we can from generic knowledge; punt the rest to the FS */
	switch(name) {
	case _PC_EXTENDED_SECURITY_NP:
		*retval = vfs_extendedsecurity(vnode_mount(vp)) ? 1 : 0;
		break;
	case _PC_AUTH_OPAQUE_NP:
		*retval = vfs_authopaque(vnode_mount(vp));
		break;
	case _PC_2_SYMLINKS:
		*retval = 1;	/* XXX NOTSUP on MSDOS, etc. */
		break;
	case _PC_ALLOC_SIZE_MIN:
		*retval = 1;	/* XXX lie: 1 byte */
		break;
	case _PC_ASYNC_IO:	/* unistd.h: _POSIX_ASYNCHRONUS_IO */
		*retval = 1;	/* [AIO] option is supported */
		break;
	case _PC_PRIO_IO:	/* unistd.h: _POSIX_PRIORITIZED_IO */
		*retval = 0;	/* [PIO] option is not supported */
		break;
	case _PC_REC_INCR_XFER_SIZE:
		*retval = 4096;	/* XXX go from MIN to MAX 4K at a time */
		break;
	case _PC_REC_MIN_XFER_SIZE:
		*retval = 4096;	/* XXX recommend 4K minimum reads/writes */
		break;
	case _PC_REC_MAX_XFER_SIZE:
		*retval = 65536; /* XXX recommend 64K maximum reads/writes */
		break;
	case _PC_REC_XFER_ALIGN:
		*retval = 4096;	/* XXX recommend page aligned buffers */
		break;
	case _PC_SYMLINK_MAX:
		*retval = 255;	/* Minimum acceptable POSIX value */
		break;
	case _PC_SYNC_IO:	/* unistd.h: _POSIX_SYNCHRONIZED_IO */
		*retval = 0;	/* [SIO] option is not supported */
		break;
	case _PC_XATTR_SIZE_BITS:
		/* The number of bits used to store maximum extended
		 * attribute size in bytes.  For example, if the maximum
		 * attribute size supported by a file system is 128K, the
		 * value returned will be 18.  However a value 18 can mean
		 * that the maximum attribute size can be anywhere from
		 * (256KB - 1) to 128KB.  As a special case, the resource
		 * fork can have much larger size, and some file system
		 * specific extended attributes can have smaller and preset
		 * size; for example, Finder Info is always 32 bytes.
		 */
		memset(&vfa, 0, sizeof(vfa));
		VFSATTR_INIT(&vfa);
		VFSATTR_WANTED(&vfa, f_capabilities);
		if (vfs_getattr(vnode_mount(vp), &vfa, ctx) == 0 &&
		    (VFSATTR_IS_SUPPORTED(&vfa, f_capabilities)) &&
		    (vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
		    (vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
			/* Supports native extended attributes */
			error = VNOP_PATHCONF(vp, name, retval, ctx);
		} else {
			/* Number of bits used to represent the maximum size of
			 * extended attribute stored in an Apple Double file.
			 */
			*retval = AD_XATTR_SIZE_BITS;
		}
		break;
	default:
		/* everything else is filesystem-specific */
		error = VNOP_PATHCONF(vp, name, retval, ctx);
		break;
	}

	return (error);
}
1509
static int
vn_kqfilt_add(struct fileproc *fp, struct knote *kn, vfs_context_t ctx)
{
	int error;
	struct vnode *vp;

	vp = (struct vnode *)fp->f_fglob->fg_data;

	/*
	 * Don't attach a knote to a dead vnode.
	 */
	if ((error = vget_internal(vp, 0, VNODE_NODEAD)) == 0) {
		/* validate the filter/vnode-type combination before attaching */
		switch (kn->kn_filter) {
		case EVFILT_READ:
		case EVFILT_WRITE:
			if (vnode_isfifo(vp)) {
				/* We'll only watch FIFOs that use our fifofs */
				if (!(vp->v_fifoinfo && vp->v_fifoinfo->fi_readsock)) {
					error = ENOTSUP;
				}

			} else if (!vnode_isreg(vp)) {
				if (vnode_ischr(vp) &&
						(error = spec_kqfilter(vp, kn)) == 0) {
					/* claimed by a special device */
					vnode_put(vp);
					return 0;
				}

				/* read/write filters only apply to regular files and FIFOs */
				error = EINVAL;
			}
			break;
		case EVFILT_VNODE:
			break;
		default:
			error = EINVAL;
		}

		if (error) {
			vnode_put(vp);
			return error;
		}

#if CONFIG_MACF
		error = mac_vnode_check_kqfilter(ctx, fp->f_fglob->fg_cred, kn, vp);
		if (error) {
			vnode_put(vp);
			return error;
		}
#endif

		/* record the vnode and its vid so the filter can re-validate it later */
		kn->kn_hook = (void*)vp;
		kn->kn_hookid = vnode_vid(vp);
		kn->kn_fop = &vnode_filtops;

		vnode_lock(vp);
		KNOTE_ATTACH(&vp->v_knotes, kn);
		vnode_unlock(vp);

		/* Ask the filesystem to provide remove notifications, but ignore failure */
		VNOP_MONITOR(vp, 0, VNODE_MONITOR_BEGIN, (void*) kn, ctx);

		vnode_put(vp);
	}

	return (error);
}
1577
1578 static void
1579 filt_vndetach(struct knote *kn)
1580 {
1581 vfs_context_t ctx = vfs_context_current();
1582 struct vnode *vp;
1583 vp = (struct vnode *)kn->kn_hook;
1584 if (vnode_getwithvid(vp, kn->kn_hookid))
1585 return;
1586
1587 vnode_lock(vp);
1588 KNOTE_DETACH(&vp->v_knotes, kn);
1589 vnode_unlock(vp);
1590
1591 /*
1592 * Tell a (generally networked) filesystem that we're no longer watching
1593 * If the FS wants to track contexts, it should still be using the one from
1594 * the VNODE_MONITOR_BEGIN.
1595 */
1596 VNOP_MONITOR(vp, 0, VNODE_MONITOR_END, (void*)kn, ctx);
1597 vnode_put(vp);
1598 }
1599
1600
1601 /*
1602 * Used for EVFILT_READ
1603 *
1604 * Takes only VFIFO or VREG. vnode is locked. We handle the "poll" case
1605 * differently than the regular case for VREG files. If not in poll(),
1606 * then we need to know current fileproc offset for VREG.
1607 */
1608 static intptr_t
1609 vnode_readable_data_count(vnode_t vp, off_t current_offset, int ispoll)
1610 {
1611 if (vnode_isfifo(vp)) {
1612 int cnt;
1613 int err = fifo_charcount(vp, &cnt);
1614 if (err == 0) {
1615 return (intptr_t)cnt;
1616 } else {
1617 return (intptr_t)0;
1618 }
1619 } else if (vnode_isreg(vp)) {
1620 if (ispoll) {
1621 return (intptr_t)1;
1622 }
1623
1624 off_t amount;
1625 amount = vp->v_un.vu_ubcinfo->ui_size - current_offset;
1626 if (amount > (off_t)INTPTR_MAX) {
1627 return INTPTR_MAX;
1628 } else if (amount < (off_t)INTPTR_MIN) {
1629 return INTPTR_MIN;
1630 } else {
1631 return (intptr_t)amount;
1632 }
1633 } else {
1634 panic("Should never have an EVFILT_READ except for reg or fifo.");
1635 return 0;
1636 }
1637 }
1638
1639 /*
1640 * Used for EVFILT_WRITE.
1641 *
1642 * For regular vnodes, we can always write (1). For named pipes,
1643 * see how much space there is in the buffer. Nothing else is covered.
1644 */
1645 static intptr_t
1646 vnode_writable_space_count(vnode_t vp)
1647 {
1648 if (vnode_isfifo(vp)) {
1649 long spc;
1650 int err = fifo_freespace(vp, &spc);
1651 if (err == 0) {
1652 return (intptr_t)spc;
1653 } else {
1654 return (intptr_t)0;
1655 }
1656 } else if (vnode_isreg(vp)) {
1657 return (intptr_t)1;
1658 } else {
1659 panic("Should never have an EVFILT_READ except for reg or fifo.");
1660 return 0;
1661 }
1662 }
1663
1664 /*
1665 * Determine whether this knote should be active
1666 *
1667 * This is kind of subtle.
 * --First, notice if the vnode has been revoked: if so, override hint
1669 * --EVFILT_READ knotes are checked no matter what the hint is
1670 * --Other knotes activate based on hint.
1671 * --If hint is revoke, set special flags and activate
1672 */
static int
filt_vnode(struct knote *kn, long hint)
{
	vnode_t vp = (struct vnode *)kn->kn_hook;
	int activate = 0;
	long orig_hint = hint;

	/*
	 * hint == 0 means we were called from above (kevent scan) rather
	 * than from a VNOP notification, so we must take the vnode lock
	 * and an iocount ourselves; a nonzero hint arrives with the vnode
	 * already locked by the caller.
	 */
	if (0 == hint) {
		vnode_lock(vp);

		if (vnode_getiocount(vp, kn->kn_hookid, VNODE_NODEAD | VNODE_WITHID) != 0) {
			/* Is recycled */
			hint = NOTE_REVOKE;
		}
	} else {
		lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED);
	}

	/* Special handling for vnodes that are in recycle or already gone */
	if (NOTE_REVOKE == hint) {
		/* force-fire once and mark EOF; the vnode is going away */
		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
		activate = 1;

		if ((kn->kn_filter == EVFILT_VNODE) && (kn->kn_sfflags & NOTE_REVOKE)) {
			kn->kn_fflags |= NOTE_REVOKE;
		}
	} else {
		switch(kn->kn_filter) {
		case EVFILT_READ:
			kn->kn_data = vnode_readable_data_count(vp, kn->kn_fp->f_fglob->fg_offset, (kn->kn_flags & EV_POLL));

			if (kn->kn_data != 0) {
				activate = 1;
			}
			break;
		case EVFILT_WRITE:
			kn->kn_data = vnode_writable_space_count(vp);

			if (kn->kn_data != 0) {
				activate = 1;
			}
			break;
		case EVFILT_VNODE:
			/* Check events this note matches against the hint */
			if (kn->kn_sfflags & hint) {
				kn->kn_fflags |= hint;	/* Set which event occurred */
			}
			if (kn->kn_fflags != 0) {
				activate = 1;
			}
			break;
		default:
			panic("Invalid knote filter on a vnode!\n");
		}
	}

	if (orig_hint == 0) {
		/*
		 * Definitely need to unlock, may need to put
		 */
		if (hint == 0) {
			/* hint still 0: the iocount we took above succeeded; drop it */
			vnode_put_locked(vp);
		}
		vnode_unlock(vp);
	}

	return (activate);
}