/*
- * Copyright (c) 2000-2010 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
#include <sys/filedesc.h>
#include <sys/stat.h>
#include <sys/buf.h>
+#include <sys/buf_internal.h>
#include <sys/proc.h>
#include <sys/kauth.h>
#include <sys/vnode.h>
#include <sys/disk.h>
#include <sys/sysctl.h>
#include <sys/fsctl.h>
+#include <sys/mount_internal.h>
#include <miscfs/specfs/specdev.h>
int
hfs_vnop_read(struct vnop_read_args *ap)
{
+ /*
+ struct vnop_read_args {
+ struct vnodeop_desc *a_desc;
+ vnode_t a_vp;
+ struct uio *a_uio;
+ int a_ioflag;
+ vfs_context_t a_context;
+ };
+ */
+
uio_t uio = ap->a_uio;
struct vnode *vp = ap->a_vp;
struct cnode *cp;
off_t offset = uio_offset(uio);
int retval = 0;
int took_truncate_lock = 0;
+ int io_throttle = 0;
/* Preflight checks */
if (!vnode_isreg(vp)) {
}
/* otherwise the file was converted back to a regular file while we were reading it */
retval = 0;
- } else if ((VTOC(vp)->c_flags & UF_COMPRESSED)) {
+ } else if ((VTOC(vp)->c_bsdflags & UF_COMPRESSED)) {
int error;
error = check_for_dataless_file(vp, NAMESPACE_HANDLER_READ_OP);
hfsmp = VTOHFS(vp);
#if CONFIG_PROTECT
- if ((retval = cp_handle_vnop (cp, CP_READ_ACCESS)) != 0) {
+ if ((retval = cp_handle_vnop (vp, CP_READ_ACCESS, ap->a_ioflag)) != 0) {
goto exit;
}
#endif
+ /*
+ * If this read request originated from a syscall (as opposed to
+ * an in-kernel page fault or other kernel-originated access), then set it up for
+ * throttle checks. For example, large EAs may cause a VNOP_READ
+ * to occur, and we wouldn't want to throttle I/O while holding the
+ * EA B-Tree lock.
+ */
+ if (ap->a_ioflag & IO_SYSCALL_DISPATCH) {
+ io_throttle = IO_RETURN_ON_THROTTLE;
+ }
+
+read_again:
+
/* Protect against a size change. */
hfs_lock_truncate(cp, HFS_SHARED_LOCK);
took_truncate_lock = 1;
KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_START,
(int)uio_offset(uio), uio_resid(uio), (int)filesize, (int)filebytes, 0);
- retval = cluster_read(vp, uio, filesize, ap->a_ioflag);
+ retval = cluster_read(vp, uio, filesize, ap->a_ioflag | (io_throttle));
cp->c_touch_acctime = TRUE;
if (took_truncate_lock) {
hfs_unlock_truncate(cp, 0);
}
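+ /*
+ * cluster_read returned early because this thread now needs to be
+ * throttled; with the truncate lock dropped, wait out the throttle
+ * delay and retry the read from read_again.
+ */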
+ if (retval == EAGAIN) {
+ throttle_lowpri_io(1);
+ retval = 0;
+ goto read_again;
+ }
return (retval);
}
int do_snapshot = 1;
time_t orig_ctime=VTOC(vp)->c_ctime;
int took_truncate_lock = 0;
+ int io_return_on_throttle = 0;
struct rl_entry *invalid_range;
#if HFS_COMPRESSION
printf("invalid state %d for compressed file\n", state);
/* fall through */
}
- } else if ((VTOC(vp)->c_flags & UF_COMPRESSED)) {
+ } else if ((VTOC(vp)->c_bsdflags & UF_COMPRESSED)) {
int error;
error = check_for_dataless_file(vp, NAMESPACE_HANDLER_WRITE_OP);
hfsmp = VTOHFS(vp);
#if CONFIG_PROTECT
- if ((retval = cp_handle_vnop (cp, CP_WRITE_ACCESS)) != 0) {
+ if ((retval = cp_handle_vnop (vp, CP_WRITE_ACCESS, 0)) != 0) {
goto exit;
}
#endif
}
#endif /* HFS_SPARSE_DEV */
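+ /*
+ * Pass IO_RETURN_ON_THROTTLE down to cluster_write only when the
+ * caller requested it along with single-writer semantics; the
+ * EAGAIN handling below restarts the write after the throttle delay.
+ */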
+ if ((ioflag & (IO_SINGLE_WRITER | IO_RETURN_ON_THROTTLE)) ==
+ (IO_SINGLE_WRITER | IO_RETURN_ON_THROTTLE)) {
+ io_return_on_throttle = IO_RETURN_ON_THROTTLE;
+ }
again:
/* Protect against a size change. */
/*
uio_setoffset(uio, fp->ff_size);
offset = fp->ff_size;
}
- if ((cp->c_flags & APPEND) && offset != fp->ff_size) {
+ if ((cp->c_bsdflags & APPEND) && offset != fp->ff_size) {
retval = EPERM;
goto exit;
}
ubc_setsize(vp, filesize);
}
retval = cluster_write(vp, uio, fp->ff_size, filesize, zero_off,
- tail_off, lflag | IO_NOZERODIRTY);
+ tail_off, lflag | IO_NOZERODIRTY | io_return_on_throttle);
if (retval) {
fp->ff_new_size = 0; /* no longer extending; use ff_size */
+
+ if (retval == EAGAIN) {
+ /*
+ * EAGAIN indicates that we still have I/O to do, but
+ * that we now need to be throttled
+ */
+ if (resid != uio_resid(uio)) {
+ /*
+ * we did manage to do some I/O before returning EAGAIN
+ */
+ resid = uio_resid(uio);
+ offset = uio_offset(uio);
+
+ cp->c_touch_chgtime = TRUE;
+ cp->c_touch_modtime = TRUE;
+ }
+ if (filesize > fp->ff_size) {
+ /*
+ * we called ubc_setsize before the call to
+ * cluster_write... since we only partially
+ * completed the I/O, we need to
+ * re-adjust our idea of the filesize based
+ * on our interim EOF
+ */
+ ubc_setsize(vp, offset);
+
+ fp->ff_size = offset;
+ }
+ goto exit;
+ }
if (filesize > origFileSize) {
ubc_setsize(vp, origFileSize);
}
if (took_truncate_lock) {
hfs_unlock_truncate(cp, 0);
}
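+ /*
+ * The write was cut short so this thread could be throttled; wait
+ * out the throttle delay, then restart the remaining I/O from 'again'.
+ */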
+ if (retval == EAGAIN) {
+ throttle_lowpri_io(1);
+
+ retval = 0;
+ goto again;
+ }
return (retval);
}
} else {
int lockflags;
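+ /*
+ * If I/O on this volume would currently be throttled, take the
+ * throttle delay now rather than while holding the catalog lock.
+ */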
+ if (throttle_io_will_be_throttled(-1, HFSTOVFS(hfsmp)))
+ throttle_lowpri_io(1);
+
lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);
-
+
/* lookup this cnid in the catalog */
error = cat_getkeyplusattr(hfsmp, cnid, keyp, cnattrp);
boolean_t is64bit;
/*
- * NOTE: on entry, the vnode is locked. Incase this vnode
+ * NOTE: on entry, the vnode has an io_ref. In case this vnode
* happens to be in our list of file_ids, we'll note it so we can
* avoid calling hfs_chashget_nowait() on that id, as that
* will cause a "locking against myself" panic.
access[i] = 0;
continue;
}
-
+
myaccess = do_access_check(hfsmp, &error, &cache, catkey.hfsPlus.parentID,
skip_cp, p, cred, context,bitmap, map_size, parents, num_parents);
#if CONFIG_PROTECT
{
int error = 0;
- if ((error = cp_handle_vnop(VTOC(vp), CP_WRITE_ACCESS)) != 0) {
+ if ((error = cp_handle_vnop(vp, CP_WRITE_ACCESS, 0)) != 0) {
return error;
}
}
}
#endif /* HFS_SPARSE_DEV */
+ /* Change the next CNID stored in the VH */
+ case HFS_CHANGE_NEXTCNID: {
+ int error = 0; /* Assume success */
+ u_int32_t fileid;
+ int wraparound = 0;
+ int lockflags = 0;
+
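+ /* Requires a writable volume and superuser or volume-owner privileges */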
+ if (vnode_vfsisrdonly(vp)) {
+ return (EROFS);
+ }
+ vfsp = vfs_statfs(HFSTOVFS(hfsmp));
+ if (suser(cred, NULL) &&
+ kauth_cred_getuid(cred) != vfsp->f_owner) {
+ return (EACCES); /* must be owner of file system */
+ }
+
+ fileid = *(u_int32_t *)ap->a_data;
+
+ /* Must have catalog lock excl. to advance the CNID pointer */
+ lockflags = hfs_systemfile_lock (hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK);
+
+ HFS_MOUNT_LOCK(hfsmp, TRUE);
+
+ /* If it is less than the current next CNID, force the wraparound bit to be set */
+ if (fileid < hfsmp->vcbNxtCNID) {
+ wraparound=1;
+ }
+
+ /* Return previous value. */
+ *(u_int32_t *)ap->a_data = hfsmp->vcbNxtCNID;
+
+ hfsmp->vcbNxtCNID = fileid;
+
+ if (wraparound) {
+ hfsmp->vcbAtrb |= kHFSCatalogNodeIDsReusedMask;
+ }
+
+ MarkVCBDirty(hfsmp);
+ HFS_MOUNT_UNLOCK(hfsmp, TRUE);
+ hfs_systemfile_unlock (hfsmp, lockflags);
+
+ return (error);
+ }
+
case F_FREEZE_FS: {
struct mount *mp;
return (EINVAL);
}
+ case F_SETSTATICCONTENT: {
+ int error;
+ int enable_static = 0;
+ struct cnode *cp = NULL;
+ /*
+ * lock the cnode, decorate the cnode flag, and bail out.
+ * VFS should have already authenticated the caller for us.
+ */
+
+ if (ap->a_data) {
+ /*
+ * Note that even though ap->a_data is of type caddr_t,
+ * the fcntl layer at the syscall handler will pass in NULL
+ * or 1, depending on the argument supplied to the fcntl. So it
+ * is in fact correct to check the ap->a_data
+ * argument for a zero or non-zero value when deciding whether or not
+ * to enable the static bit in the cnode.
+ */
+ enable_static = 1;
+ }
+ if (hfsmp->hfs_flags & HFS_READ_ONLY) {
+ return EROFS;
+ }
+ cp = VTOC(vp);
+
+ error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK);
+ if (error == 0) {
+ if (enable_static) {
+ cp->c_flag |= C_SSD_STATIC;
+ }
+ else {
+ cp->c_flag &= ~C_SSD_STATIC;
+ }
+ hfs_unlock (cp);
+ }
+ return error;
+ }
+
+ case F_SETBACKINGSTORE: {
+
+ int error = 0;
+
+ /*
+ * See comment in F_SETSTATICCONTENT re: using
+ * a null check for a_data
+ */
+ if (ap->a_data) {
+ error = hfs_set_backingstore (vp, 1);
+ }
+ else {
+ error = hfs_set_backingstore (vp, 0);
+ }
+
+ return error;
+ }
+
+ case F_GETPATH_MTMINFO: {
+ int error = 0;
+
+ int *data = (int*) ap->a_data;
+
+ /* Ask if this is a backingstore vnode */
+ error = hfs_is_backingstore (vp, data);
+
+ return error;
+ }
+
case F_FULLFSYNC: {
int error;
return (error);
}
- case F_READBOOTSTRAP:
- case F_WRITEBOOTSTRAP:
- return 0;
-
case _IOC(IOC_OUT,'h', 4, 0): /* Create date in local time */
{
if (is64bit) {
*(uint32_t *)ap->a_data = hfsmp->hfs_last_mounted_mtime;
break;
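+ /* Report the current "very low disk" (danger) free space threshold */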
+ case HFS_FSCTL_GET_VERY_LOW_DISK:
+ *(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_dangerlimit;
+ break;
+
case HFS_FSCTL_SET_VERY_LOW_DISK:
if (*(uint32_t *)ap->a_data >= hfsmp->hfs_freespace_notify_warninglimit) {
return EINVAL;
hfsmp->hfs_freespace_notify_dangerlimit = *(uint32_t *)ap->a_data;
break;
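+ /* Report the current "low disk" (warning) free space threshold */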
+ case HFS_FSCTL_GET_LOW_DISK:
+ *(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_warninglimit;
+ break;
+
case HFS_FSCTL_SET_LOW_DISK:
if ( *(uint32_t *)ap->a_data >= hfsmp->hfs_freespace_notify_desiredlevel
|| *(uint32_t *)ap->a_data <= hfsmp->hfs_freespace_notify_dangerlimit) {
hfsmp->hfs_freespace_notify_warninglimit = *(uint32_t *)ap->a_data;
break;
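+ /* Report the current desired free space level */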
+ case HFS_FSCTL_GET_DESIRED_DISK:
+ *(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_desiredlevel;
+ break;
+
case HFS_FSCTL_SET_DESIRED_DISK:
if (*(uint32_t *)ap->a_data <= hfsmp->hfs_freespace_notify_warninglimit) {
return EINVAL;
}
/* Verify that the start offset is within the logical file size */
- if (ap->a_foffset > fp->ff_size) {
+ if (ap->a_foffset >= fp->ff_size) {
goto exit;
}
- /* Searching file extents has failed for read operation, therefore
- * search rangelist for any uncommitted holes in the file.
+ /*
+ * At this point, we have encountered a failure during
+ * MapFileBlockC that resulted in ERANGE, and we are not servicing
+ * a write, and there are borrowed blocks.
+ *
+ * However, the cluster layer will not call blockmap for
+ * blocks that are borrowed and in-cache. We have to assume that
+ * because we observed ERANGE being emitted from MapFileBlockC, this
+ * extent range is not valid on-disk. So we treat this as a
+ * mapping that needs to be zero-filled prior to reading.
+ *
+ * Note that under certain circumstances (such as non-contiguous
+ * userland VM mappings in the calling process), cluster_io
+ * may be forced to split a large I/O driven by hfs_vnop_write
+ * into multiple sub-I/Os that necessitate a RMW cycle. If this is
+ * the case here, then we have already removed the invalid range list
+ * mapping prior to getting to this blockmap call, so we should not
+ * search the invalid rangelist for this byte range.
*/
- overlaptype = rl_scan(&fp->ff_invalidranges, ap->a_foffset,
- ap->a_foffset + (off_t)(ap->a_size - 1),
- &invalid_range);
- switch(overlaptype) {
- case RL_OVERLAPISCONTAINED:
- /* start_offset <= rl_start, end_offset >= rl_end */
- if (ap->a_foffset != invalid_range->rl_start) {
- break;
- }
- case RL_MATCHINGOVERLAP:
- /* start_offset = rl_start, end_offset = rl_end */
- case RL_OVERLAPCONTAINSRANGE:
- /* start_offset >= rl_start, end_offset <= rl_end */
- case RL_OVERLAPSTARTSBEFORE:
- /* start_offset > rl_start, end_offset >= rl_start */
- if ((off_t)fp->ff_size > (invalid_range->rl_end + 1)) {
- bytesContAvail = (invalid_range->rl_end + 1) - ap->a_foffset;
- } else {
- bytesContAvail = fp->ff_size - ap->a_foffset;
- }
- if (bytesContAvail > ap->a_size) {
- bytesContAvail = ap->a_size;
- }
- *ap->a_bpn = (daddr64_t)-1;
- retval = 0;
- break;
- case RL_OVERLAPENDSAFTER:
- /* start_offset < rl_start, end_offset < rl_end */
- case RL_NOOVERLAP:
- break;
+
+ bytesContAvail = fp->ff_size - ap->a_foffset;
+ /*
+ * Clip the contiguous available bytes to, at most, the allowable
+ * maximum or the amount requested.
+ */
+
+ if (bytesContAvail > ap->a_size) {
+ bytesContAvail = ap->a_size;
}
+
+ *ap->a_bpn = (daddr64_t) -1;
+ retval = 0;
+
goto exit;
}
return (MacToVFSError(retval));
}
-
/*
* prepare and issue the I/O
* buf_strategy knows how to deal
vnode_t vp = buf_vnode(bp);
int error = 0;
+ /* Mark buffer as containing static data if cnode flag set */
+ if (VTOC(vp)->c_flag & C_SSD_STATIC) {
+ buf_markstatic(bp);
+ }
+
#if CONFIG_PROTECT
cnode_t *cp = NULL;
if ((cp = cp_get_protected_cnode(vp)) != NULL) {
- /*
- * Some paths to hfs_vnop_strategy will take the cnode lock,
- * and some won't. But since content protection is only enabled
- * for files that (a) aren't system files and (b) are regular
- * files, any valid cnode here will be unlocked.
+ /*
+ * We rely upon the truncate lock to protect the
+ * CP cache key from getting tossed prior to our IO finishing here.
+ * Nearly all cluster io calls to manipulate file payload from HFS
+ * take the truncate lock before calling into the cluster
+ * layer to ensure the file size does not change, or that they
+ * have exclusive right to change the EOF of the file.
+ * That same guarantee protects us here since the code that
+ * deals with CP lock events must now take the truncate lock
+ * before doing anything.
+ *
+ * There is one exception here: VM swapfile IO. HFS funnels
+ * VNOP_PAGEOUT for swapfiles directly into a cluster_pageout call
+ * without holding the truncate lock, because individual swapfiles
+ * are maintained at fixed-length sizes by the VM code.
+ * In non-swapfile IO we use PAGEOUT_V2 semantics which allow us to
+ * create our own UPL and thus take the truncate lock before calling
+ * into the cluster layer. In that case, however, we are not concerned
+ * with the CP blob being wiped out in the middle of the IO
+ * because there isn't anything to toss; the VM swapfile key stays
+ * in-core as long as the file is open.
+ *
+ * NB:
+ * For filesystem resize, we may not have access to the underlying
+ * file's cache key for whatever reason (device may be locked). However,
+ * we do not need it since we are going to use the temporary HFS-wide resize key
+ * which is generated once we start relocating file content. If this file's I/O
+ * should be done using the resize key, it will have been supplied already, so
+ * do not attach the file's cp blob to the buffer.
*/
- hfs_lock(cp, HFS_SHARED_LOCK);
- buf_setcpaddr(bp, cp->c_cpentry);
+ if ((cp->c_cpentry->cp_flags & CP_RELOCATION_INFLIGHT) == 0) {
+ buf_setcpaddr(bp, cp->c_cpentry);
+ }
}
#endif /* CONFIG_PROTECT */
error = buf_strategy(VTOHFS(vp)->hfs_devvp, ap);
-
-#if CONFIG_PROTECT
- if (cp) {
- hfs_unlock(cp);
- }
-#endif
return error;
}
struct filefork *fp = VTOF(vp);
struct cnode *cp = VTOC(vp);
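+ /* retval is only used by the QUOTA-specific code in this function */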
+#if QUOTA
int retval = 0;
+#endif /* QUOTA */
/* Cannot truncate an HFS directory! */
if (vnode_isdir(vp)) {
* for use when deleting a file. The simplification here is that we know
* that we are releasing all blocks.
*
+ * Note that this function may be called when there is no vnode backing
+ * the file fork in question. We may call this from hfs_vnop_inactive
+ * to clear out resource fork data (and may not want to clear out the data
+ * fork yet). As a result, we pointer-check both sets of inputs before
+ * doing anything with them.
+ *
* The caller is responsible for saving off a copy of the filefork(s)
* embedded within the cnode prior to calling this function. The pointers
* supplied as arguments must be valid even if the cnode is no longer valid.
blksize = hfsmp->blockSize;
/* Data Fork */
- if (datafork->ff_blocks > 0) {
+ if ((datafork != NULL) && (datafork->ff_blocks > 0)) {
fileblocks = datafork->ff_blocks;
filebytes = (off_t)fileblocks * (off_t)blksize;
fp = VTOF(vp);
#if CONFIG_PROTECT
- if ((error = cp_handle_vnop(cp, CP_READ_ACCESS | CP_WRITE_ACCESS)) != 0) {
+ if ((error = cp_handle_vnop(vp, CP_READ_ACCESS | CP_WRITE_ACCESS, 0)) != 0) {
return error;
}
#endif /* CONFIG_PROTECT */
error = EINVAL;
goto pagein_done;
}
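+ /* Note which page within the UPL this request actually needs */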
+ ubc_upl_range_needed(upl, ap->a_pl_offset / PAGE_SIZE, 1);
+
isize = ap->a_size;
/*
a_pl_offset = 0;
/*
- * take truncate lock (shared) to guard against
- * zero-fill thru fsync interfering, but only for v2
+ * For V2 semantics, we want to take the cnode truncate lock
+ * shared to guard against the file size changing via zero-filling.
+ *
+ * However, we have to be careful because we may be invoked
+ * via the ubc_msync path to write out dirty mmap'd pages
+ * in response to a lock event on a content-protected
+ * filesystem (e.g. to write out class A files).
+ * As a result, we want to take the truncate lock 'SHARED' with
+ * the mini-recursion locktype so that we don't deadlock/panic
+ * because we may already be holding the truncate lock exclusive
+ * in order to force any other IOs to queue up behind us.
*/
- hfs_lock_truncate(cp, HFS_SHARED_LOCK);
+ hfs_lock_truncate(cp, HFS_RECURSE_TRUNCLOCK);
if (a_flags & UPL_MSYNC) {
request_flags = UPL_UBC_MSYNC | UPL_RET_ONLY_DIRTY;
pageout_done:
if (is_pageoutv2) {
- /* release truncate lock (shared) */
- hfs_unlock_truncate(cp, 0);
+ /*
+ * Release the truncate lock. Note that because
+ * we may have taken the lock recursively by
+ * being invoked via ubc_msync due to lockdown,
+ * we should release it recursively, too.
+ */
+ hfs_unlock_truncate(cp, 1);
}
return (retval);
}
return EINVAL;
}
#endif
-
/* If it's an SSD, also disable HFS relocation */
if (hfsmp->hfs_flags & HFS_SSD) {
return EINVAL;
}
+
blksize = hfsmp->blockSize;
if (blockHint == 0)
blockHint = hfsmp->nextAllocation;
hfs_unlock(VTOC(vp));
#if CONFIG_PROTECT
- if ((error = cp_handle_vnop(VTOC(vp), CP_WRITE_ACCESS)) != 0) {
+ if ((error = cp_handle_vnop(vp, CP_WRITE_ACCESS, 0)) != 0) {
hfs_lock(VTOC(vp), HFS_FORCE_LOCK);
return (error);
}
break;
}
if (uio_resid(auio) != 0) {
- printf("hfs_clonefile: cluster_read: uio_resid = %lld\n", uio_resid(auio));
+ printf("hfs_clonefile: cluster_read: uio_resid = %lld\n", (int64_t)uio_resid(auio));
error = EIO;
break;
}