+ lck_mtx_lock_spin(&fg->fg_lock);
+ while (fg->fg_lflags & FG_OFF_LOCKED) {
+ fg->fg_lflags |= FG_OFF_LOCKWANT;
+ msleep(&fg->fg_lflags, &fg->fg_lock, PVFS | PSPIN,
+ "fg_offset_lock_wait", 0);
+ }
+ fg->fg_lflags |= FG_OFF_LOCKED;
+ lck_mtx_unlock(&fg->fg_lock);
+}
+
+static inline void
+vn_offset_unlock(struct fileglob *fg)
+{
+ int lock_wanted = 0;
+
+ lck_mtx_lock_spin(&fg->fg_lock);
+ if (fg->fg_lflags & FG_OFF_LOCKWANT) {
+ lock_wanted = 1;
+ }
+ fg->fg_lflags &= ~(FG_OFF_LOCKED | FG_OFF_LOCKWANT);
+ lck_mtx_unlock(&fg->fg_lock);
+ if (lock_wanted) {
+ wakeup(&fg->fg_lflags);
+ }
+}
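+
+/*
+ * Illustrative usage (a sketch, not part of this change): callers that
+ * implicitly share the file offset bracket their I/O with the pair above,
+ * e.g.
+ *
+ * vn_offset_lock(fp->fp_glob);
+ * ... perform I/O at fp->fp_glob->fg_offset ...
+ * fp->fp_glob->fg_offset += bytes_moved;
+ * vn_offset_unlock(fp->fp_glob);
+ *
+ * so that concurrent read(2)/write(2) calls through the same fileglob
+ * observe a consistent offset. vn_read() and vn_write() below follow
+ * this pattern; "bytes_moved" is a placeholder name.
+ */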
+
+/*
+ * File table vnode read routine.
+ */
+static int
+vn_read(struct fileproc *fp, struct uio *uio, int flags, vfs_context_t ctx)
+{
+ struct vnode *vp;
+ int error;
+ int ioflag;
+ off_t read_offset;
+ user_ssize_t read_len;
+ user_ssize_t adjusted_read_len;
+ user_ssize_t clippedsize;
+ bool offset_locked;
+
+ read_len = uio_resid(uio);
+ if (read_len < 0 || read_len > INT_MAX) {
+ return EINVAL;
+ }
+ adjusted_read_len = read_len;
+ clippedsize = 0;
+ offset_locked = false;
+
+ vp = (struct vnode *)fp->fp_glob->fg_data;
+ if ((error = vnode_getwithref(vp))) {
+ return error;
+ }
+
+#if CONFIG_MACF
+ error = mac_vnode_check_read(ctx, vfs_context_ucred(ctx), vp);
+ if (error) {
+ (void)vnode_put(vp);
+ return error;
+ }
+#endif
+
+ /* IO_SYSCALL_DISPATCH signals to VNOP handlers that this read came from a file table read */
+ ioflag = IO_SYSCALL_DISPATCH;
+
+ if (fp->fp_glob->fg_flag & FNONBLOCK) {
+ ioflag |= IO_NDELAY;
+ }
+ if ((fp->fp_glob->fg_flag & FNOCACHE) || vnode_isnocache(vp)) {
+ ioflag |= IO_NOCACHE;
+ }
+ if (fp->fp_glob->fg_flag & FENCRYPTED) {
+ ioflag |= IO_ENCRYPTED;
+ }
+ if (fp->fp_glob->fg_flag & FUNENCRYPTED) {
+ ioflag |= IO_SKIP_ENCRYPTION;
+ }
+ if (fp->fp_glob->fg_flag & O_EVTONLY) {
+ ioflag |= IO_EVTONLY;
+ }
+ if (fp->fp_glob->fg_flag & FNORDAHEAD) {
+ ioflag |= IO_RAOFF;
+ }
+
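+ /*
+ * No FOF_OFFSET means this is a plain read(2) using the offset shared
+ * through the fileglob (pread(2) passes FOF_OFFSET and supplies its
+ * own). Serialize implicit-offset I/O on regular, non-swap vnodes so
+ * that reading the offset here and updating it after the I/O are
+ * atomic with respect to other implicit-offset callers.
+ */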
+ if ((flags & FOF_OFFSET) == 0) {
+ if ((vnode_vtype(vp) == VREG) && !vnode_isswap(vp)) {
+ vn_offset_lock(fp->fp_glob);
+ offset_locked = true;
+ }
+ read_offset = fp->fp_glob->fg_offset;
+ uio_setoffset(uio, read_offset);
+ } else {
+ read_offset = uio_offset(uio);
+ /* POSIX allows negative offsets for character devices. */
+ if ((read_offset < 0) && (vnode_vtype(vp) != VCHR)) {
+ error = EINVAL;
+ goto error_out;
+ }
+ }
+
+ if (read_offset == INT64_MAX) {
+ /* can't read any more */
+ error = 0;
+ goto error_out;
+ }
+
+ /*
+ * If offset + len would cause overflow, reduce the len to a value
+ * (adjusted_read_len) where it won't.
+ */
+ if ((read_offset >= 0) && (INT64_MAX - read_offset) < read_len) {
+ /*
+ * 0 read_offset INT64_MAX
+ * |-----------------------------------------------|----------|~~~
+ * <--read_len-->
+ * <-adjusted->
+ */
+ adjusted_read_len = (user_ssize_t)(INT64_MAX - read_offset);
+ }
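+
+ /*
+ * Worked example (hypothetical values): with read_offset ==
+ * INT64_MAX - 10 and read_len == 100, offset + len would overflow,
+ * so adjusted_read_len becomes 10; the 90 clipped bytes are added
+ * back to the residual after the read below.
+ */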
+
+ if (adjusted_read_len < read_len) {
+ uio_setresid(uio, adjusted_read_len);
+ clippedsize = read_len - adjusted_read_len;
+ }
+
+ if (vnode_isswap(vp) && !(IO_SKIP_ENCRYPTION & ioflag)) {
+ /* special case for swap files */
+ error = vn_read_swapfile(vp, uio);
+ } else {
+ error = VNOP_READ(vp, uio, ioflag, ctx);
+ }
+
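+ /*
+ * If the request was clipped to avoid offset overflow, add the
+ * clipped byte count back to the residual so the caller sees how
+ * much of the original request remains unsatisfied.
+ */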
+ if (clippedsize) {
+ uio_setresid(uio, (uio_resid(uio) + clippedsize));
+ }
+
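+ /*
+ * For implicit-offset reads, advance the shared offset by the
+ * number of bytes actually transferred.
+ */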
+ if ((flags & FOF_OFFSET) == 0) {
+ fp->fp_glob->fg_offset += read_len - uio_resid(uio);
+ }
+
+error_out:
+ if (offset_locked) {
+ vn_offset_unlock(fp->fp_glob);
+ offset_locked = false;
+ }
+
+ (void)vnode_put(vp);
+ return error;
+}
+
+
+/*
+ * File table vnode write routine.
+ */
+static int
+vn_write(struct fileproc *fp, struct uio *uio, int flags, vfs_context_t ctx)
+{
+ struct vnode *vp;
+ int error, ioflag;
+ off_t write_offset;
+ off_t write_end_offset;
+ user_ssize_t write_len;
+ user_ssize_t adjusted_write_len;
+ user_ssize_t clippedsize;
+ bool offset_locked;
+ proc_t p = vfs_context_proc(ctx);
+ rlim_t rlim_cur_fsize = p ? proc_limitgetcur(p, RLIMIT_FSIZE, TRUE) : 0;
+
+ write_len = uio_resid(uio);
+ if (write_len < 0 || write_len > INT_MAX) {
+ return EINVAL;
+ }
+ adjusted_write_len = write_len;
+ clippedsize = 0;
+ offset_locked = false;
+
+ vp = (struct vnode *)fp->fp_glob->fg_data;
+ if ((error = vnode_getwithref(vp))) {
+ return error;
+ }
+
+#if CONFIG_MACF
+ error = mac_vnode_check_write(ctx, vfs_context_ucred(ctx), vp);
+ if (error) {
+ (void)vnode_put(vp);
+ return error;
+ }
+#endif
+
+ /*
+ * IO_SYSCALL_DISPATCH signals to VNOP handlers that this write came from
+ * a file table write
+ */
+ ioflag = (IO_UNIT | IO_SYSCALL_DISPATCH);
+
+ if (vp->v_type == VREG && (fp->fp_glob->fg_flag & O_APPEND)) {
+ ioflag |= IO_APPEND;
+ }
+ if (fp->fp_glob->fg_flag & FNONBLOCK) {
+ ioflag |= IO_NDELAY;
+ }
+ if ((fp->fp_glob->fg_flag & FNOCACHE) || vnode_isnocache(vp)) {
+ ioflag |= IO_NOCACHE;
+ }
+ if (fp->fp_glob->fg_flag & FNODIRECT) {
+ ioflag |= IO_NODIRECT;
+ }
+ if (fp->fp_glob->fg_flag & FSINGLE_WRITER) {
+ ioflag |= IO_SINGLE_WRITER;
+ }
+ if (fp->fp_glob->fg_flag & O_EVTONLY) {
+ ioflag |= IO_EVTONLY;
+ }
+
+ /*
+ * Treat synchronous mounts and O_FSYNC on the fd as equivalent.
+ *
+ * XXX We treat O_DSYNC as O_FSYNC for now, since we cannot delay
+ * XXX the non-essential metadata without some additional VFS work;
+ * XXX the intent at this point is to plumb the interface for it.
+ */
+ if ((fp->fp_glob->fg_flag & (O_FSYNC | O_DSYNC)) ||
+ (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS))) {
+ ioflag |= IO_SYNC;
+ }
+
+ if ((flags & FOF_OFFSET) == 0) {
+ if ((vnode_vtype(vp) == VREG) && !vnode_isswap(vp)) {
+ vn_offset_lock(fp->fp_glob);
+ offset_locked = true;
+ }
+ write_offset = fp->fp_glob->fg_offset;
+ uio_setoffset(uio, write_offset);
+ } else {
+ /* for pwrite, append should be ignored */
+ ioflag &= ~IO_APPEND;
+ write_offset = uio_offset(uio);
+ /* POSIX allows negative offsets for character devices. */
+ if ((write_offset < 0) && (vnode_vtype(vp) != VCHR)) {
+ error = EINVAL;
+ goto error_out;
+ }
+ }
+
+ if (write_offset == INT64_MAX) {
+ /* writes are not possible */
+ error = EFBIG;
+ goto error_out;
+ }
+
+ /*
+ * write_len is the original write length that was requested.
+ * We may, however, need to reduce it for two reasons:
+ *
+ * 1) write_offset + write_len would exceed OFF_T_MAX (i.e. INT64_MAX),
+ * and/or
+ * 2) write_offset + write_len would exceed the administrative
+ * limit for the maximum file size.
+ *
+ * In both cases the write is denied outright if we cannot write even a
+ * single byte; otherwise it is "clipped" (i.e. a short write).
+ */
+
+ /*
+ * If offset + len would cause overflow, reduce the len
+ * to a value (adjusted_write_len) where it won't.
+ */
+ if ((write_offset >= 0) && (INT64_MAX - write_offset) < write_len) {
+ /*
+ * 0 write_offset INT64_MAX
+ * |-----------------------------------------------|----------|~~~
+ * <--write_len-->
+ * <-adjusted->
+ */
+ adjusted_write_len = (user_ssize_t)(INT64_MAX - write_offset);
+ }
+
+ /* write_end_offset will always be [0, INT64_MAX] */
+ write_end_offset = write_offset + adjusted_write_len;
+
+ if (p && (vp->v_type == VREG) &&
+ (rlim_cur_fsize != RLIM_INFINITY) &&
+ (rlim_cur_fsize <= INT64_MAX) &&
+ (write_end_offset > (off_t)rlim_cur_fsize)) {
+ /*
+ * If the requested residual would cause us to go past the
+ * administrative limit, then we need to adjust the residual
+ * down to cause fewer bytes than requested to be written. If
+ * we can't do that (e.g. the residual is already 1 byte),
+ * then we fail the write with EFBIG.
+ */
+ if (write_offset >= (off_t)rlim_cur_fsize) {
+ /*
+ * 0 rlim_fsize write_offset write_end INT64_MAX
+ * |------------------------|----------|-------------|--------|
+ * <--write_len-->
+ *
+ * write not permitted
+ */
+ psignal(p, SIGXFSZ);
+ error = EFBIG;
+ goto error_out;
+ }
+
+ /*
+ * 0 write_offset rlim_fsize write_end INT64_MAX
+ * |------------------------|-----------|---------|------------|
+ * <------write_len------>
+ * <-adjusted-->
+ */
+ adjusted_write_len = (user_ssize_t)((off_t)rlim_cur_fsize - write_offset);
+ assert((adjusted_write_len > 0) && (adjusted_write_len < write_len));
+ }
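+
+ /*
+ * Worked example (hypothetical values): with rlim_cur_fsize == 1 MiB,
+ * write_offset == 1 MiB - 100 and write_len == 4096, the write is
+ * clipped to adjusted_write_len == 100. Had write_offset already been
+ * at or beyond 1 MiB, the process would instead get SIGXFSZ and the
+ * write would fail with EFBIG.
+ */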
+
+ if (adjusted_write_len < write_len) {
+ uio_setresid(uio, adjusted_write_len);
+ clippedsize = write_len - adjusted_write_len;
+ }
+
+ error = VNOP_WRITE(vp, uio, ioflag, ctx);
+
+ /*
+ * If we had to reduce the size of write requested either because
+ * of rlimit or because it would have exceeded
+ * maximum file size, we have to add that back to the residual so
+ * it correctly reflects what we did in this function.
+ */
+ if (clippedsize) {
+ uio_setresid(uio, (uio_resid(uio) + clippedsize));
+ }
+
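+ /*
+ * For implicit-offset writes, advance the shared offset: an O_APPEND
+ * write adopts the post-write uio offset, since the filesystem chose
+ * the starting point (EOF); otherwise advance by the bytes actually
+ * written.
+ */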
+ if ((flags & FOF_OFFSET) == 0) {
+ if (ioflag & IO_APPEND) {
+ fp->fp_glob->fg_offset = uio_offset(uio);
+ } else {
+ fp->fp_glob->fg_offset += (write_len - uio_resid(uio));
+ }
+ if (offset_locked) {
+ vn_offset_unlock(fp->fp_glob);
+ offset_locked = false;
+ }
+ }
+
+ /*
+ * Set the credentials on successful writes
+ */
+ if ((error == 0) && (vp->v_tag == VT_NFS) && (UBCINFOEXISTS(vp))) {
+ /*
+ * When called from aio subsystem, we only have the proc from
+ * which to get the credential, at this point, so use that
+ * instead. This means aio functions are incompatible with
+ * per-thread credentials (aio operations are proxied). We
+ * can't easily correct the aio vs. settid race in this case
+ * anyway, so we disallow it.
+ */
+ if ((flags & FOF_PCRED) == 0) {
+ ubc_setthreadcred(vp, p, current_thread());
+ } else {
+ ubc_setcred(vp, p);
+ }
+ }
+ (void)vnode_put(vp);
+ return error;
+
+error_out:
+ if (offset_locked) {
+ vn_offset_unlock(fp->fp_glob);
+ }
+ (void)vnode_put(vp);
+ return error;
+}
+
+/*
+ * File table vnode stat routine.
+ *
+ * Returns: 0 Success
+ * EBADF
+ * ENOMEM
+ * vnode_getattr:???
+ */
+int
+vn_stat_noauth(struct vnode *vp, void *sbptr, kauth_filesec_t *xsec, int isstat64,
+ int needsrealdev, vfs_context_t ctx, struct ucred *file_cred)
+{
+ struct vnode_attr va;
+ int error;
+ u_short mode;
+ kauth_filesec_t fsec;
+ struct stat *sb = (struct stat *)0; /* warning avoidance; protected by isstat64 */
+ struct stat64 *sb64 = (struct stat64 *)0; /* warning avoidance; protected by isstat64 */