bsd/hfs/hfs_encodinghint.c standard
bsd/hfs/hfs_encodings.c standard
bsd/hfs/hfs_endian.c optional hfs
+bsd/hfs/hfs_fsinfo.c optional hfs
bsd/hfs/hfs_hotfiles.c optional hfs
bsd/hfs/hfs_link.c optional hfs
bsd/hfs/hfs_lookup.c optional hfs
*/
/*
- * Portions copyright (c) 2011, Joyent, Inc. All rights reserved.
+ * Portions Copyright (c) 2011, Joyent, Inc. All rights reserved.
+ * Portions Copyright (c) 2012 by Delphix. All rights reserved.
*/
/*
{
dtrace_speculation_t *spec;
dtrace_buffer_t *src, *dest;
- uintptr_t daddr, saddr, dlimit;
+ uintptr_t daddr, saddr, dlimit, slimit;
dtrace_speculation_state_t current, new = DTRACESPEC_INACTIVE;
intptr_t offs;
+ uint64_t timestamp;
if (which == 0)
return;
}
/*
- * We have the space; copy the buffer across. (Note that this is a
+ * We have sufficient space to copy the speculative buffer into the
+ * primary buffer. First, modify the speculative buffer, filling
+ * in the timestamp of all entries with the current time. The data
+ * must have the commit() time rather than the time it was traced,
+ * so that all entries in the primary buffer are in timestamp order.
+ */
+ timestamp = dtrace_gethrtime();
+ saddr = (uintptr_t)src->dtb_tomax;
+ slimit = saddr + src->dtb_offset;
+ while (saddr < slimit) {
+ size_t size;
+ dtrace_rechdr_t *dtrh = (dtrace_rechdr_t *)saddr;
+
+ if (dtrh->dtrh_epid == DTRACE_EPIDNONE) {
+ saddr += sizeof (dtrace_epid_t);
+ continue;
+ }
+
+ ASSERT(dtrh->dtrh_epid <= ((dtrace_epid_t) state->dts_necbs));
+ size = state->dts_ecbs[dtrh->dtrh_epid - 1]->dte_size;
+
+ ASSERT(saddr + size <= slimit);
+ ASSERT(size >= sizeof(dtrace_rechdr_t));
+ ASSERT(DTRACE_RECORD_LOAD_TIMESTAMP(dtrh) == UINT64_MAX);
+
+ DTRACE_RECORD_STORE_TIMESTAMP(dtrh, timestamp);
+
+ saddr += size;
+ }
+
+ /*
+ * Copy the buffer across. (Note that this is a
* highly suboptimal bcopy(); in the unlikely event that this becomes
* a serious performance issue, a high-performance DTrace-specific
* bcopy() should obviously be invented.)
tomax = buf->dtb_tomax;
ASSERT(tomax != NULL);
- if (ecb->dte_size != 0)
- DTRACE_STORE(uint32_t, tomax, offs, ecb->dte_epid);
+ /*
+ * Build and store the record header corresponding to the ECB.
+ */
+ if (ecb->dte_size != 0) {
+ dtrace_rechdr_t dtrh;
+
+ if (!(mstate.dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
+ mstate.dtms_timestamp = dtrace_gethrtime();
+ mstate.dtms_present |= DTRACE_MSTATE_TIMESTAMP;
+ }
+
+ ASSERT(ecb->dte_size >= sizeof(dtrace_rechdr_t));
+
+ dtrh.dtrh_epid = ecb->dte_epid;
+ DTRACE_RECORD_STORE_TIMESTAMP(&dtrh, mstate.dtms_timestamp);
+ DTRACE_STORE(dtrace_rechdr_t, tomax, offs, dtrh);
+ }
mstate.dtms_epid = ecb->dte_epid;
mstate.dtms_present |= DTRACE_MSTATE_EPID;
continue;
switch (act->dta_kind) {
- case DTRACEACT_SPECULATE:
+ case DTRACEACT_SPECULATE: {
+ dtrace_rechdr_t *dtrh = NULL;
+
ASSERT(buf == &state->dts_buffer[cpuid]);
buf = dtrace_speculation_buffer(state,
cpuid, val);
ASSERT(tomax != NULL);
if (ecb->dte_size != 0)
- DTRACE_STORE(uint32_t, tomax, offs,
- ecb->dte_epid);
- continue;
+ continue;
+
+ ASSERT(ecb->dte_size >= sizeof(dtrace_rechdr_t));
+ dtrh = ((void *)(tomax + offs));
+ dtrh->dtrh_epid = ecb->dte_epid;
+
+ /*
+ * When the speculation is committed, all of
+ * the records in the speculative buffer will
+ * have their timestamps set to the commit
+ * time. Until then, the timestamp is set to a
+ * sentinel value, for debuggability.
+ */
+ DTRACE_RECORD_STORE_TIMESTAMP(dtrh, UINT64_MAX);
+
+ continue;
+ }
case DTRACEACT_CHILL:
if (dtrace_priv_kernel_destructive(state))
/*
* The default size is the size of the default action: recording
- * the epid.
+ * the header.
*/
- ecb->dte_size = ecb->dte_needed = sizeof (dtrace_epid_t);
+ ecb->dte_size = ecb->dte_needed = sizeof (dtrace_rechdr_t);
ecb->dte_alignment = sizeof (dtrace_epid_t);
epid = state->dts_epid++;
static void
dtrace_ecb_resize(dtrace_ecb_t *ecb)
{
- uint32_t maxalign = sizeof (dtrace_epid_t);
- uint32_t align = sizeof (uint8_t), offs, diff;
dtrace_action_t *act;
- int wastuple = 0;
+ uint32_t curneeded = UINT32_MAX;
uint32_t aggbase = UINT32_MAX;
- dtrace_state_t *state = ecb->dte_state;
/*
- * If we record anything, we always record the epid. (And we always
- * record it first.)
+ * If we record anything, we always record the dtrace_rechdr_t. (And
+ * we always record it first.)
*/
- offs = sizeof (dtrace_epid_t);
- ecb->dte_size = ecb->dte_needed = sizeof (dtrace_epid_t);
+ ecb->dte_size = sizeof (dtrace_rechdr_t);
+ ecb->dte_alignment = sizeof (dtrace_epid_t);
for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
dtrace_recdesc_t *rec = &act->dta_rec;
+ ASSERT(rec->dtrd_size > 0 || rec->dtrd_alignment == 1);
- if ((align = rec->dtrd_alignment) > maxalign)
- maxalign = align;
-
- if (!wastuple && act->dta_intuple) {
- /*
- * This is the first record in a tuple. Align the
- * offset to be at offset 4 in an 8-byte aligned
- * block.
- */
- diff = offs + sizeof (dtrace_aggid_t);
-
- if ((diff = (diff & (sizeof (uint64_t) - 1))))
- offs += sizeof (uint64_t) - diff;
-
- aggbase = offs - sizeof (dtrace_aggid_t);
- ASSERT(!(aggbase & (sizeof (uint64_t) - 1)));
- }
-
- /*LINTED*/
- if (rec->dtrd_size != 0 && (diff = (offs & (align - 1)))) {
- /*
- * The current offset is not properly aligned; align it.
- */
- offs += align - diff;
- }
-
- rec->dtrd_offset = offs;
-
- if (offs + rec->dtrd_size > ecb->dte_needed) {
- ecb->dte_needed = offs + rec->dtrd_size;
-
- if (ecb->dte_needed > state->dts_needed)
- state->dts_needed = ecb->dte_needed;
- }
+ ecb->dte_alignment = MAX(ecb->dte_alignment, rec->dtrd_alignment);
if (DTRACEACT_ISAGG(act->dta_kind)) {
dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
- dtrace_action_t *first = agg->dtag_first, *prev;
- ASSERT(rec->dtrd_size != 0 && first != NULL);
- ASSERT(wastuple);
+ ASSERT(rec->dtrd_size != 0);
+ ASSERT(agg->dtag_first != NULL);
+ ASSERT(act->dta_prev->dta_intuple);
ASSERT(aggbase != UINT32_MAX);
+ ASSERT(curneeded != UINT32_MAX);
agg->dtag_base = aggbase;
- while ((prev = first->dta_prev) != NULL &&
- DTRACEACT_ISAGG(prev->dta_kind)) {
- agg = (dtrace_aggregation_t *)prev;
- first = agg->dtag_first;
- }
+ curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);
+ rec->dtrd_offset = curneeded;
+ curneeded += rec->dtrd_size;
+ ecb->dte_needed = MAX(ecb->dte_needed, curneeded);
- if (prev != NULL) {
- offs = prev->dta_rec.dtrd_offset +
- prev->dta_rec.dtrd_size;
- } else {
- offs = sizeof (dtrace_epid_t);
+ aggbase = UINT32_MAX;
+ curneeded = UINT32_MAX;
+ } else if (act->dta_intuple) {
+ if (curneeded == UINT32_MAX) {
+ /*
+ * This is the first record in a tuple. Align
+ * curneeded to be at offset 4 in an 8-byte
+ * aligned block.
+ */
+ ASSERT(act->dta_prev == NULL || !act->dta_prev->dta_intuple);
+ ASSERT(aggbase == UINT32_MAX);
+
+ curneeded = P2PHASEUP(ecb->dte_size,
+ sizeof (uint64_t), sizeof (dtrace_aggid_t));
+
+ aggbase = curneeded - sizeof (dtrace_aggid_t);
+ ASSERT(IS_P2ALIGNED(aggbase,
+ sizeof (uint64_t)));
}
- wastuple = 0;
- } else {
- if (!act->dta_intuple)
- ecb->dte_size = offs + rec->dtrd_size;
- offs += rec->dtrd_size;
+ curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);
+ rec->dtrd_offset = curneeded;
+ curneeded += rec->dtrd_size;
+ } else {
+ /* tuples must be followed by an aggregation */
+ ASSERT(act->dta_prev == NULL || !act->dta_prev->dta_intuple);
+ ecb->dte_size = P2ROUNDUP(ecb->dte_size, rec->dtrd_alignment);
+ rec->dtrd_offset = ecb->dte_size;
+ ecb->dte_size += rec->dtrd_size;
+ ecb->dte_needed = MAX(ecb->dte_needed, ecb->dte_size);
}
-
- wastuple = act->dta_intuple;
}
if ((act = ecb->dte_action) != NULL &&
!(act->dta_kind == DTRACEACT_SPECULATE && act->dta_next == NULL) &&
- ecb->dte_size == sizeof (dtrace_epid_t)) {
+ ecb->dte_size == sizeof (dtrace_rechdr_t)) {
/*
- * If the size is still sizeof (dtrace_epid_t), then all
+ * If the size is still sizeof (dtrace_rechdr_t), then all
* actions store no data; set the size to 0.
*/
- ecb->dte_alignment = maxalign;
ecb->dte_size = 0;
-
- /*
- * If the needed space is still sizeof (dtrace_epid_t), then
- * all actions need no additional space; set the needed
- * size to 0.
- */
- if (ecb->dte_needed == sizeof (dtrace_epid_t))
- ecb->dte_needed = 0;
-
- return;
}
- /*
- * Set our alignment, and make sure that the dte_size and dte_needed
- * are aligned to the size of an EPID.
- */
- ecb->dte_alignment = maxalign;
- ecb->dte_size = (ecb->dte_size + (sizeof (dtrace_epid_t) - 1)) &
- ~(sizeof (dtrace_epid_t) - 1);
- ecb->dte_needed = (ecb->dte_needed + (sizeof (dtrace_epid_t) - 1)) &
- ~(sizeof (dtrace_epid_t) - 1);
- ASSERT(ecb->dte_size <= ecb->dte_needed);
+ ecb->dte_size = P2ROUNDUP(ecb->dte_size, sizeof (dtrace_epid_t));
+ ecb->dte_needed = P2ROUNDUP(ecb->dte_needed, (sizeof (dtrace_epid_t)));
+ ecb->dte_state->dts_needed = MAX(ecb->dte_state->dts_needed, ecb->dte_needed);
}
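/*
 * Worked example for the tuple alignment in dtrace_ecb_resize() above
 * (illustrative only; it assumes dtrace_rechdr_t is 12 bytes, i.e. a 4-byte
 * EPID followed by a split 64-bit timestamp, and that dtrace_aggid_t is
 * 4 bytes):
 *
 *   ecb->dte_size = sizeof (dtrace_rechdr_t)                  = 12
 *   curneeded     = P2PHASEUP(12, sizeof (uint64_t),
 *                             sizeof (dtrace_aggid_t))        = 12
 *   aggbase       = curneeded - sizeof (dtrace_aggid_t)       = 8
 *
 * so aggbase lands on an 8-byte boundary while curneeded sits at offset 4
 * within that 8-byte block, which is what the ASSERTs above check.  Each
 * subsequent in-tuple record is then placed at curneeded after rounding it
 * up to the record's own alignment with P2ROUNDUP().
 */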
static dtrace_action_t *
break;
case DTRACEACT_SPECULATE:
- if (ecb->dte_size > sizeof (dtrace_epid_t))
+ if (ecb->dte_size > sizeof (dtrace_rechdr_t))
return (EINVAL);
if (dp == NULL)
ecb->dte_action = NULL;
ecb->dte_action_last = NULL;
- ecb->dte_size = sizeof (dtrace_epid_t);
+ ecb->dte_size = 0;
}
static void
caddr_t tomax = buf->dtb_tomax;
caddr_t xamot = buf->dtb_xamot;
dtrace_icookie_t cookie;
+ hrtime_t now;
ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
ASSERT(!(buf->dtb_flags & DTRACEBUF_RING));
cookie = dtrace_interrupt_disable();
+ now = dtrace_gethrtime();
buf->dtb_tomax = xamot;
buf->dtb_xamot = tomax;
buf->dtb_xamot_drops = buf->dtb_drops;
buf->dtb_drops = 0;
buf->dtb_errors = 0;
buf->dtb_flags &= ~(DTRACEBUF_ERROR | DTRACEBUF_DROPPED);
+ buf->dtb_interval = now - buf->dtb_switched;
+ buf->dtb_switched = now;
dtrace_interrupt_enable(cookie);
}
desc.dtbd_drops = buf->dtb_drops;
desc.dtbd_errors = buf->dtb_errors;
desc.dtbd_oldest = buf->dtb_xamot_offset;
+ desc.dtbd_timestamp = dtrace_gethrtime();
lck_mtx_unlock(&dtrace_lock);
desc.dtbd_drops = buf->dtb_xamot_drops;
desc.dtbd_errors = buf->dtb_xamot_errors;
desc.dtbd_oldest = 0;
+ desc.dtbd_timestamp = buf->dtb_switched;
lck_mtx_unlock(&dtrace_lock);
/*
- * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
/* How many free extents to cache per volume */
#define kMaxFreeExtents 10
+/* The maximum time hfs locks can be held while performing hfs statistics gathering */
+#define HFS_FSINFO_MAX_LOCKHELD_TIME (20 * 1000000ULL) /* at most 20 milliseconds. */
+
/*
* HFS_MINFREE gives the minimum acceptable percentage
* of file system blocks which may be free (but this
extern int hfs_bmap(struct vnode *, daddr_t, struct vnode **, daddr64_t *, unsigned int *);
-extern int hfs_fsync(struct vnode *, int, int, struct proc *);
-
-extern int hfs_access(struct vnode *, mode_t, kauth_cred_t, struct proc *);
-
-extern int hfs_removeallattr(struct hfsmount *hfsmp, u_int32_t fileid);
-
-extern int hfs_set_volxattr(struct hfsmount *hfsmp, unsigned int xattrtype, int state);
-
-extern int hfs_isallocated(struct hfsmount *hfsmp, u_int32_t startingBlock, u_int32_t numBlocks);
-
-extern int hfs_count_allocated(struct hfsmount *hfsmp, u_int32_t startBlock,
- u_int32_t numBlocks, u_int32_t *alloc_count);
-
-extern int hfs_isrbtree_active (struct hfsmount *hfsmp);
extern errno_t hfs_ubc_setsize(vnode_t vp, off_t len, bool have_cnode_lock);
extern int hfs_update(struct vnode *, int);
+extern int hfs_fsync(struct vnode *, int, int, struct proc *);
/*****************************************************************************
Functions from hfs_xattr.c
int hfs_xattr_write(vnode_t vp, const char *name, const void *data, size_t size);
int hfs_setxattr_internal(struct cnode *, const void *, size_t,
struct vnop_setxattr_args *, struct hfsmount *, u_int32_t);
+extern int hfs_removeallattr(struct hfsmount *hfsmp, u_int32_t fileid);
+extern int hfs_set_volxattr(struct hfsmount *hfsmp, unsigned int xattrtype, int state);
+
/*****************************************************************************
extern cnid_t hfs_currentcnid(cnode_t *cp);
+/*****************************************************************************
+ Functions from VolumeAllocation.c
+ ******************************************************************************/
+extern int hfs_isallocated(struct hfsmount *hfsmp, u_int32_t startingBlock,
+ u_int32_t numBlocks);
+
+extern int hfs_count_allocated(struct hfsmount *hfsmp, u_int32_t startBlock,
+ u_int32_t numBlocks, u_int32_t *alloc_count);
+
+extern int hfs_isrbtree_active (struct hfsmount *hfsmp);
+
+/*****************************************************************************
+ Functions from hfs_fsinfo.c
+ ******************************************************************************/
+extern errno_t hfs_get_fsinfo(struct hfsmount *hfsmp, void *a_data);
+extern void hfs_fsinfo_data_add(struct hfs_fsinfo_data *fsinfo, uint64_t entry);
+
#endif /* __APPLE_API_PRIVATE */
#endif /* KERNEL */
#endif /* __HFS__ */
vnode_t vp = NULL, rvp = NULL;
/*
- * Deal with any pending set sizes. We need to call
- * ubc_setsize before we drop the exclusive lock. Ideally,
- * hfs_unlock should be called before hfs_unlock_truncate but
- * that's a lot to ask people to remember :-)
+ * If there are pending set sizes, the cnode lock should be dropped
+ * first.
*/
+#if DEBUG
+ assert(!(cp->c_lockowner == thread
+ && ISSET(cp->c_flag, C_NEED_DATA_SETSIZE | C_NEED_RSRC_SETSIZE)));
+#elif DEVELOPMENT
if (cp->c_lockowner == thread
&& ISSET(cp->c_flag, C_NEED_DATA_SETSIZE | C_NEED_RSRC_SETSIZE)) {
- // hfs_unlock will do the setsize calls for us
- hfs_unlock(cp);
- hfs_lock_always(cp, HFS_EXCLUSIVE_LOCK);
+ printf("hfs: hfs_unlock_truncate called with C_NEED_DATA/RSRC_SETSIZE set (caller: 0x%llx)\n",
+ (uint64_t)VM_KERNEL_UNSLIDE(__builtin_return_address(0)));
}
-
+#endif
+
if (cp->c_need_dvnode_put_after_truncate_unlock) {
vp = cp->c_vp;
cp->c_need_dvnode_put_after_truncate_unlock = false;
/*
- * Copyright (c) 2004-2014 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2004-2015 Apple Computer, Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
};
+// Will be deprecated and replaced by hfs_fsinfo
struct hfsinfo_metadata {
uint32_t total;
uint32_t extents;
uint32_t reserved[4];
};
+/*
+ * Flags for hfs_fsinfo_data structure
+ */
+#define HFS_FSINFO_CLASS_A 0x0001 /* Information for class A files requested */
+#define HFS_FSINFO_CLASS_B 0x0002 /* Information for class B files requested */
+#define HFS_FSINFO_CLASS_C 0x0004 /* Information for class C files requested */
+#define HFS_FSINFO_CLASS_D 0x0008 /* Information for class D files requested */
+
+/*
+ * Maximum number of buckets to represent range from 0 to 1TB (2^40) in
+ * increments of power of 2, and one catch-all bucket for anything that
+ * is greater than 1TB
+ */
+#define HFS_FSINFO_DATA_MAX_BUCKETS 42
+
+/*
+ * Maximum number of buckets to represent the percentage range from 0 to 100
+ * in increments of 10.
+ */
+#define HFS_FSINFO_PERCENT_MAX_BUCKETS 10
+
+/*
+ * Maximum number of buckets to represent number of file/directory name characters
+ * (range 1 to 255) in increments of 5.
+ */
+#define HFS_FSINFO_NAME_MAX_BUCKETS 51
+
+/*
+ * Version number to ensure that the caller and the kernel have the same understanding
+ * of the hfs_fsinfo_data structure. This version needs to be bumped whenever the
+ * number of buckets is changed.
+ */
+#define HFS_FSINFO_VERSION 1
+
+/*
+ * hfs_fsinfo_data is a generic data structure to aggregate information like sizes
+ * or counts in power-of-2 buckets.  Each bucket represents a range of values
+ * that is determined based on its index in the array.  Specifically, bucket[i]
+ * represents values that are greater than or equal to 2^(i-1) and less than 2^i,
+ * except the last bucket, which represents the range greater than or equal to 2^(i-1).
+ *
+ * The current maximum number of buckets is 42 (indices 0 to 41), so we can
+ * represent the range from 0 up to 1TB in increments of powers of 2, plus a
+ * catch-all bucket for anything that is greater than or equal to 1TB.
+ *
+ * For example,
+ * bucket[0] -> greater than or equal to 0 and less than 1
+ * bucket[1] -> greater than or equal to 1 and less than 2
+ * bucket[10] -> greater than or equal to 2^(10-1) = 512 and less than 2^10 = 1024
+ * bucket[20] -> greater than or equal to 2^(20-1) = 512KB and less than 2^20 = 1MB
+ * bucket[41] -> greater than or equal to 2^(41-1) = 1TB
+ *
+ * Note that fsctls that populate this data structure can take a long time to
+ * execute, as the operation can be both I/O intensive (traversing btrees) and
+ * compute intensive.
+ *
+ * WARNING: Any changes to this structure should also update version number to
+ * ensure that the clients and kernel are reading/writing correctly.
+ */
+
+/*
+ * The header includes the user input fields.
+ */
+typedef struct hfs_fsinfo_header {
+ uint32_t request_type;
+ uint16_t version;
+ uint16_t flags;
+} hfs_fsinfo_header_t;
+
+struct hfs_fsinfo_data {
+ hfs_fsinfo_header_t header;
+ uint32_t bucket[HFS_FSINFO_DATA_MAX_BUCKETS];
+};
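+
+/*
+ * Worked example (illustrative): with the bucketing described above, a value
+ * of 3000 is counted in bucket[12], since 2^11 = 2048 <= 3000 < 4096 = 2^12;
+ * a value of 0 is counted in bucket[0]; and any value of 1TB (2^40) or more
+ * is counted in the final catch-all bucket, bucket[41].
+ */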
+
+/*
+ * Structure to represent information about metadata files
+ *
+ * WARNING: Any changes to this structure should also update version number to
+ * ensure that the clients and kernel are reading/writing correctly.
+ */
+struct hfs_fsinfo_metadata {
+ hfs_fsinfo_header_t header;
+ uint32_t extents;
+ uint32_t catalog;
+ uint32_t allocation;
+ uint32_t attribute;
+ uint32_t journal;
+};
+
+/*
+ * Structure to represent distribution of number of file name characters
+ * in increments of 5.  Each bucket represents a range of values that is
+ * determined based on its index in the array.  So bucket[i] represents name
+ * lengths that are greater than (i*5) and less than or equal to ((i+1)*5).
+ *
+ * Since this structure represents the range of file name lengths and the
+ * maximum number of unicode characters in HFS+ is 255, the maximum number
+ * of buckets will be 51 [0..50].
+ *
+ * For example,
+ * bucket[4]  -> greater than 20 and up to 25 characters
+ * bucket[50] -> greater than 250 and up to 255 characters
+ *
+ * WARNING: Any changes to this structure should also update version number to
+ * ensure that the clients and kernel are reading/writing correctly.
+ */
+struct hfs_fsinfo_name {
+ hfs_fsinfo_header_t header;
+ uint32_t bucket[HFS_FSINFO_NAME_MAX_BUCKETS];
+};
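+
+/*
+ * Worked example (illustrative): a 23-character name is counted in bucket[4]
+ * because (23 - 1) / 5 == 4 (i.e. names of 21 to 25 characters), and the
+ * maximum 255-character name lands in the last bucket, bucket[50], because
+ * (255 - 1) / 5 == 50.
+ */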
+
+/*
+ * Structure to represent information about content protection classes
+ *
+ * WARNING: Any changes to this structure should also update version number to
+ * ensure that the clients and kernel are reading/writing correctly.
+ */
+struct hfs_fsinfo_cprotect {
+ hfs_fsinfo_header_t header;
+ uint32_t class_A;
+ uint32_t class_B;
+ uint32_t class_C;
+ uint32_t class_D;
+ uint32_t class_E;
+ uint32_t class_F;
+};
+
+/*
+ * Union of all the different values returned by HFSIOC_FSINFO fsctl
+ */
+union hfs_fsinfo {
+ hfs_fsinfo_header_t header;
+ struct hfs_fsinfo_data data;
+ struct hfs_fsinfo_metadata metadata;
+ struct hfs_fsinfo_name name;
+ struct hfs_fsinfo_cprotect cprotect;
+};
+typedef union hfs_fsinfo hfs_fsinfo;
+
+/*
+ * Type of FSINFO requested, specified by the caller in request_type field
+ */
+enum {
+ /* Information about number of allocation blocks for each metadata file, returns struct hfs_fsinfo_metadata */
+ HFS_FSINFO_METADATA_BLOCKS_INFO = 1,
+
+ /* Information about number of extents for each metadata file, returns struct hfs_fsinfo_metadata */
+ HFS_FSINFO_METADATA_EXTENTS = 2,
+
+ /* Information about percentage of free nodes vs used nodes in metadata btrees, returns struct hfs_fsinfo_metadata */
+ HFS_FSINFO_METADATA_PERCENTFREE = 3,
+
+ /* Distribution of number of extents for data files (data fork, no rsrc fork, no xattr), returns struct hfs_fsinfo_data */
+ HFS_FSINFO_FILE_EXTENT_COUNT = 4,
+
+ /* Distribution of extent sizes for data files (data fork, no rsrc fork, no xattr), returns struct hfs_fsinfo_data */
+ HFS_FSINFO_FILE_EXTENT_SIZE = 5,
+
+ /* Distribution of file sizes for data files (data fork, no rsrc fork, no xattr), returns struct hfs_fsinfo_data */
+ HFS_FSINFO_FILE_SIZE = 6,
+
+ /* Distribution of valence for all directories, returns struct hfs_fsinfo_data */
+ HFS_FSINFO_DIR_VALENCE = 7,
+
+ /* Distribution of file/directory name size in unicode characters, returns struct hfs_fsinfo_name */
+ HFS_FSINFO_NAME_SIZE = 8,
+
+ /* Distribution of extended attribute sizes, returns hfs_fsinfo_data */
+ HFS_FSINFO_XATTR_SIZE = 9,
+
+ /* Distribution of free space for the entire file system, returns struct hfs_fsinfo_data */
+ HFS_FSINFO_FREE_EXTENTS = 10,
+
+ /* Information about number of files belonging to each class, returns hfs_fsinfo_cprotect */
+ HFS_FSINFO_FILE_CPROTECT_COUNT = 11,
+
+ /*
+ * Distribution of symbolic link sizes for data files (data fork, no rsrc fork, no xattr),
+ * returns struct hfs_fsinfo_data
+ */
+ HFS_FSINFO_SYMLINK_SIZE = 12,
+};
+
/* HFS FS CONTROL COMMANDS */
/*
+ * XXX: Will be deprecated and replaced by HFSIOC_GET_FSINFO
+ *
* Get information about number of file system allocation blocks used by metadata
* files on the volume, including individual btrees and journal file. The caller
* can determine the size of file system allocation block using value returned as
#define HFSIOC_CS_FREESPACE_TRIM _IOWR('h', 39, u_int32_t)
#define HFS_CS_FREESPACE_TRIM IOCBASECMD(HFSIOC_CS_FREESPACE_TRIM)
+/* Get file system information for the given volume */
+#define HFSIOC_GET_FSINFO _IOWR('h', 45, hfs_fsinfo)
+#define HFS_GET_FSINFO IOCBASECMD(HFSIOC_GET_FSINFO)
+
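+/*
+ * Illustrative userspace sketch (not part of this header): querying the
+ * file size distribution.  The fsctl(2) invocation below is an assumption
+ * based on how other HFSIOC_* requests are issued; note that the kernel
+ * handler requires root and a matching HFS_FSINFO_VERSION.
+ *
+ *	hfs_fsinfo fsinfo;
+ *	memset(&fsinfo, 0, sizeof(fsinfo));
+ *	fsinfo.header.request_type = HFS_FSINFO_FILE_SIZE;
+ *	fsinfo.header.version = HFS_FSINFO_VERSION;
+ *	if (fsctl("/", HFSIOC_GET_FSINFO, &fsinfo, 0) == -1)
+ *		err(1, "HFSIOC_GET_FSINFO");
+ *	// fsinfo.data.bucket[i] now holds the number of files whose size
+ *	// falls in the i'th power-of-2 bucket.
+ */
+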
#endif /* __APPLE_API_UNSTABLE */
#endif /* ! _HFS_FSCTL_H_ */
--- /dev/null
+/*
+ * Copyright (c) 2014-2015 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include <sys/cprotect.h>
+#include <sys/xattr.h>
+#include <sys/utfconv.h>
+#include <libkern/OSByteOrder.h>
+#include <kern/kalloc.h>
+#include <sys/stat.h>
+
+#include "hfs.h"
+#include "hfs_fsctl.h"
+#include "hfs_endian.h"
+#include "hfscommon/headers/BTreesInternal.h"
+#include "hfscommon/headers/BTreesPrivate.h"
+#include "hfscommon/headers/FileMgrInternal.h"
+
+#if CONFIG_PROTECT
+#include <hfs/hfs_cprotect.h>
+#endif
+
+
+union HFSPlusRecord {
+ HFSPlusCatalogFolder folder_record;
+ HFSPlusCatalogFile file_record;
+ HFSPlusCatalogThread thread_record;
+ HFSPlusExtentRecord extent_record;
+ HFSPlusAttrRecord attr_record;
+};
+typedef union HFSPlusRecord HFSPlusRecord;
+
+union HFSPlusKey {
+ HFSPlusExtentKey extent_key;
+ HFSPlusAttrKey attr_key;
+};
+typedef union HFSPlusKey HFSPlusKey;
+
+typedef enum traverse_btree_flag {
+
+ // If set, the extents btree will also be traversed along with the catalog btree, so grab the correct locks up front
+ TRAVERSE_BTREE_EXTENTS = 1,
+
+ // When getting content-protection attributes, allocate enough space to accommodate the records.
+ TRAVERSE_BTREE_XATTR_CPROTECT = 2,
+
+} traverse_btree_flag_t;
+
+
+
+static errno_t hfs_fsinfo_metadata_blocks(struct hfsmount *hfsmp, struct hfs_fsinfo_metadata *fsinfo);
+static errno_t hfs_fsinfo_metadata_extents(struct hfsmount *hfsmp, struct hfs_fsinfo_metadata *fsinfo);
+static errno_t hfs_fsinfo_metadata_percentfree(struct hfsmount *hfsmp, struct hfs_fsinfo_metadata *fsinfo);
+static errno_t fsinfo_file_extent_count_callback(struct hfsmount *hfsmp, HFSPlusKey *key, HFSPlusRecord *record, void *data);
+static errno_t fsinfo_file_extent_size_catalog_callback(struct hfsmount *hfsmp, HFSPlusKey *key, HFSPlusRecord *record, void *data);
+static errno_t fsinfo_file_extent_size_overflow_callback(struct hfsmount *hfsmp, HFSPlusKey *key, HFSPlusRecord *record, void *data);
+static errno_t fsinfo_file_size_callback(struct hfsmount *hfsmp, HFSPlusKey *key, HFSPlusRecord *record, void *data);
+static errno_t fsinfo_dir_valence_callback(struct hfsmount *hfsmp, HFSPlusKey *key, HFSPlusRecord *record, void *data);
+static errno_t fsinfo_name_size_callback(struct hfsmount *hfsmp, HFSPlusKey *key, HFSPlusRecord *record, void *data);
+static errno_t fsinfo_xattr_size_callback(struct hfsmount *hfsmp, HFSPlusKey *key, HFSPlusRecord *record, void *data);
+static errno_t traverse_btree(struct hfsmount *hfsmp, uint32_t btree_fileID, traverse_btree_flag_t flags, void *fsinfo,
+ int (*callback)(struct hfsmount *, HFSPlusKey *, HFSPlusRecord *, void *));
+static errno_t hfs_fsinfo_free_extents(struct hfsmount *hfsmp, struct hfs_fsinfo_data *fsinfo);
+static void fsinfo_free_extents_callback(void *data, off_t free_extent_size);
+#if CONFIG_PROTECT
+static errno_t fsinfo_cprotect_count_callback(struct hfsmount *hfsmp, HFSPlusKey *key, HFSPlusRecord *record, void *data);
+#endif
+static errno_t fsinfo_symlink_size_callback(struct hfsmount *hfsmp, HFSPlusKey *key, HFSPlusRecord *record, void *data);
+
+/*
+ * Entry function for all the fsinfo requests from hfs_vnop_ioctl()
+ * Depending on the type of request, this function will call the
+ * appropriate sub-function and return success or failure back to
+ * the caller.
+ */
+__private_extern__
+errno_t hfs_get_fsinfo(struct hfsmount *hfsmp, void *a_data)
+{
+ int error = 0;
+ hfs_fsinfo *fsinfo_union;
+ uint32_t request_type;
+ uint32_t header_len = sizeof(hfs_fsinfo_header_t);
+
+ fsinfo_union = (hfs_fsinfo *)a_data;
+ request_type = fsinfo_union->header.request_type;
+
+ // Zero out output fields to fsinfo_union, keep the user input fields intact.
+ bzero((char *)fsinfo_union + header_len, sizeof(hfs_fsinfo) - header_len);
+
+ switch (request_type) {
+ case HFS_FSINFO_METADATA_BLOCKS_INFO:
+ error = hfs_fsinfo_metadata_blocks(hfsmp, &(fsinfo_union->metadata));
+ break;
+
+ case HFS_FSINFO_METADATA_EXTENTS:
+ error = hfs_fsinfo_metadata_extents(hfsmp, &(fsinfo_union->metadata));
+ break;
+
+ case HFS_FSINFO_METADATA_PERCENTFREE:
+ error = hfs_fsinfo_metadata_percentfree(hfsmp, &(fsinfo_union->metadata));
+ break;
+
+ case HFS_FSINFO_FILE_EXTENT_COUNT:
+ /* Traverse catalog btree and invoke callback for all records */
+ error = traverse_btree(hfsmp, kHFSCatalogFileID, TRAVERSE_BTREE_EXTENTS, &(fsinfo_union->data), fsinfo_file_extent_count_callback);
+ break;
+
+ case HFS_FSINFO_FILE_EXTENT_SIZE:
+ /* Traverse the catalog btree first */
+ error = traverse_btree(hfsmp, kHFSCatalogFileID, 0, &(fsinfo_union->data), &fsinfo_file_extent_size_catalog_callback);
+ if (error) {
+ break;
+ }
+ /* Traverse the overflow extents btree now */
+ error = traverse_btree(hfsmp, kHFSExtentsFileID, 0, &(fsinfo_union->data), &fsinfo_file_extent_size_overflow_callback);
+ break;
+
+ case HFS_FSINFO_FILE_SIZE:
+ /* Traverse catalog btree and invoke callback for all records */
+ error = traverse_btree(hfsmp, kHFSCatalogFileID, 0, &(fsinfo_union->data), &fsinfo_file_size_callback);
+ break;
+
+ case HFS_FSINFO_DIR_VALENCE:
+ /* Traverse catalog btree and invoke callback for all records */
+ error = traverse_btree(hfsmp, kHFSCatalogFileID, 0, &(fsinfo_union->data), &fsinfo_dir_valence_callback);
+ break;
+
+ case HFS_FSINFO_NAME_SIZE:
+ /* Traverse catalog btree and invoke callback for all records */
+ error = traverse_btree(hfsmp, kHFSCatalogFileID, 0, &(fsinfo_union->name), &fsinfo_name_size_callback);
+ break;
+
+ case HFS_FSINFO_XATTR_SIZE:
+ /* Traverse attribute btree and invoke callback for all records */
+ error = traverse_btree(hfsmp, kHFSAttributesFileID, 0, &(fsinfo_union->data), &fsinfo_xattr_size_callback);
+ break;
+
+ case HFS_FSINFO_FREE_EXTENTS:
+ error = hfs_fsinfo_free_extents(hfsmp, &(fsinfo_union->data));
+ break;
+
+ case HFS_FSINFO_SYMLINK_SIZE:
+ /* Traverse catalog btree and invoke callback for all records */
+ error = traverse_btree(hfsmp, kHFSCatalogFileID, 0, &(fsinfo_union->data), &fsinfo_symlink_size_callback);
+ break;
+
+#if CONFIG_PROTECT
+ case HFS_FSINFO_FILE_CPROTECT_COUNT:
+ /* Traverse attribute btree and invoke callback for all records */
+ error = traverse_btree(hfsmp, kHFSAttributesFileID, TRAVERSE_BTREE_XATTR_CPROTECT, &(fsinfo_union->cprotect), &fsinfo_cprotect_count_callback);
+ break;
+#endif
+
+ default:
+ return ENOTSUP;
+ };
+
+ return error;
+}
+
+/*
+ * This function provides information about total number of allocation blocks
+ * for each individual metadata file.
+ */
+static errno_t
+hfs_fsinfo_metadata_blocks(struct hfsmount *hfsmp, struct hfs_fsinfo_metadata *fsinfo)
+{
+ int lockflags = 0;
+ int ret_lockflags = 0;
+
+ /*
+ * Getting number of allocation blocks for all metadata files
+ * should be a relatively quick operation, so we grab locks for all
+ * the btrees at the same time
+ */
+ lockflags = SFL_CATALOG | SFL_EXTENTS | SFL_BITMAP | SFL_ATTRIBUTE;
+ ret_lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_SHARED_LOCK);
+
+ /* Get information about all the btrees */
+ fsinfo->extents = hfsmp->hfs_extents_cp->c_datafork->ff_blocks;
+ fsinfo->catalog = hfsmp->hfs_catalog_cp->c_datafork->ff_blocks;
+ fsinfo->allocation = hfsmp->hfs_allocation_cp->c_datafork->ff_blocks;
+ if (hfsmp->hfs_attribute_cp)
+ fsinfo->attribute = hfsmp->hfs_attribute_cp->c_datafork->ff_blocks;
+ else
+ fsinfo->attribute = 0;
+
+ /* Done with btrees, give up the locks */
+ hfs_systemfile_unlock(hfsmp, ret_lockflags);
+
+ /* Get information about journal file */
+ fsinfo->journal = howmany(hfsmp->jnl_size, hfsmp->blockSize);
+
+ return 0;
+}
+
+/*
+ * Helper function to count the number of valid extents in a file fork structure
+ */
+static uint32_t
+hfs_count_extents_fp(struct filefork *ff)
+{
+ int i;
+ uint32_t count = 0;
+ for (i = 0; i < kHFSPlusExtentDensity; i++) {
+ if (ff->ff_data.cf_extents[i].blockCount == 0) {
+ break;
+ }
+ count++;
+ }
+ return count;
+}
+
+
+/*
+ * This is a helper function that counts the total number of valid
+ * extents in all the overflow extent records for given fileID
+ * in overflow extents btree
+ */
+static errno_t
+hfs_count_overflow_extents(struct hfsmount *hfsmp, uint32_t fileID, uint32_t *num_extents)
+{
+ int error;
+ FCB *fcb;
+ struct BTreeIterator *iterator = NULL;
+ FSBufferDescriptor btdata;
+ HFSPlusExtentKey *extentKey;
+ HFSPlusExtentRecord extentData;
+ uint32_t extent_count = 0;
+ int i;
+
+ fcb = VTOF(hfsmp->hfs_extents_vp);
+ MALLOC(iterator, struct BTreeIterator *, sizeof(struct BTreeIterator), M_TEMP, M_WAITOK | M_ZERO);
+
+ extentKey = (HFSPlusExtentKey *) &iterator->key;
+ extentKey->keyLength = kHFSPlusExtentKeyMaximumLength;
+ extentKey->forkType = kHFSDataForkType;
+ extentKey->fileID = fileID;
+ extentKey->startBlock = 0;
+
+ btdata.bufferAddress = &extentData;
+ btdata.itemSize = sizeof(HFSPlusExtentRecord);
+ btdata.itemCount = 1;
+
+ /* Search for overflow extent record */
+ error = BTSearchRecord(fcb, iterator, &btdata, NULL, iterator);
+
+ /*
+ * We used startBlock of zero, so we will not find any records and errors
+ * are expected. It will also position the iterator just before the first
+ * overflow extent record for given fileID (if any).
+ */
+ if (error && error != fsBTRecordNotFoundErr && error != fsBTEndOfIterationErr)
+ goto out;
+ error = 0;
+
+ for (;;) {
+
+ if (msleep(NULL, NULL, PINOD | PCATCH,
+ "hfs_fsinfo", NULL) == EINTR) {
+ error = EINTR;
+ break;
+ }
+
+ error = BTIterateRecord(fcb, kBTreeNextRecord, iterator, &btdata, NULL);
+ if (error != 0) {
+ /* These are expected errors, so mask them */
+ if (error == fsBTRecordNotFoundErr || error == fsBTEndOfIterationErr) {
+ error = 0;
+ }
+ break;
+ }
+
+ /* If we encounter different fileID, stop the iteration */
+ if (extentKey->fileID != fileID) {
+ break;
+ }
+
+ if (extentKey->forkType != kHFSDataForkType)
+ break;
+
+ /* This is our record of interest; only count the datafork extents. */
+ for (i = 0; i < kHFSPlusExtentDensity; i++) {
+ if (extentData[i].blockCount == 0) {
+ break;
+ }
+ extent_count++;
+ }
+ }
+
+out:
+ FREE(iterator, M_TEMP);
+
+ if (error == 0) {
+ *num_extents = extent_count;
+ }
+ return MacToVFSError(error);
+}
+
+/*
+ * This function provides information about total number of extents (including
+ * extents from overflow extents btree, if any) for each individual metadata
+ * file.
+ */
+static errno_t
+hfs_fsinfo_metadata_extents(struct hfsmount *hfsmp, struct hfs_fsinfo_metadata *fsinfo)
+{
+ int error = 0;
+ int lockflags = 0;
+ int ret_lockflags = 0;
+ uint32_t overflow_count;
+
+ /*
+ * Counting the number of extents for all metadata files should
+ * be a relatively quick operation, so we grab locks for all the
+ * btrees at the same time
+ */
+ lockflags = SFL_CATALOG | SFL_EXTENTS | SFL_BITMAP | SFL_ATTRIBUTE;
+ ret_lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_SHARED_LOCK);
+
+ /* Get number of extents for extents overflow btree */
+ fsinfo->extents = hfs_count_extents_fp(hfsmp->hfs_extents_cp->c_datafork);
+
+ /* Get number of extents for catalog btree */
+ fsinfo->catalog = hfs_count_extents_fp(hfsmp->hfs_catalog_cp->c_datafork);
+ if (fsinfo->catalog >= kHFSPlusExtentDensity) {
+ error = hfs_count_overflow_extents(hfsmp, kHFSCatalogFileID, &overflow_count);
+ if (error) {
+ goto out;
+ }
+ fsinfo->catalog += overflow_count;
+ }
+
+ /* Get number of extents for allocation file */
+ fsinfo->allocation = hfs_count_extents_fp(hfsmp->hfs_allocation_cp->c_datafork);
+ if (fsinfo->allocation >= kHFSPlusExtentDensity) {
+ error = hfs_count_overflow_extents(hfsmp, kHFSAllocationFileID, &overflow_count);
+ if (error) {
+ goto out;
+ }
+ fsinfo->allocation += overflow_count;
+ }
+
+ /*
+ * Get number of extents for attribute btree.
+ * hfs_attribute_cp might be NULL.
+ */
+ if (hfsmp->hfs_attribute_cp) {
+ fsinfo->attribute = hfs_count_extents_fp(hfsmp->hfs_attribute_cp->c_datafork);
+ if (fsinfo->attribute >= kHFSPlusExtentDensity) {
+ error = hfs_count_overflow_extents(hfsmp, kHFSAttributesFileID, &overflow_count);
+ if (error) {
+ goto out;
+ }
+ fsinfo->attribute += overflow_count;
+ }
+ }
+ /* Journal always has one extent */
+ fsinfo->journal = 1;
+out:
+ hfs_systemfile_unlock(hfsmp, ret_lockflags);
+ return error;
+}
+
+/*
+ * Helper function to calculate a percentage, i.e. X is what percent of Y?
+ */
+static inline uint32_t
+hfs_percent(uint32_t X, uint32_t Y)
+{
+ return (X * 100ll) / Y;
+}
+
+/*
+ * This function provides percentage of free nodes vs total nodes for each
+ * individual metadata btrees, i.e. for catalog, overflow extents and
+ * attributes btree. This information is not applicable for allocation
+ * file and journal file.
+ */
+static errno_t
+hfs_fsinfo_metadata_percentfree(struct hfsmount *hfsmp, struct hfs_fsinfo_metadata *fsinfo)
+{
+ int lockflags = 0;
+ int ret_lockflags = 0;
+ BTreeControlBlockPtr btreePtr;
+ uint32_t free_nodes, total_nodes;
+
+ /*
+ * Getting total and used nodes for all metadata btrees should
+ * be a relatively quick operation, so we grab locks for all the
+ * btrees at the same time
+ */
+ lockflags = SFL_CATALOG | SFL_EXTENTS | SFL_BITMAP | SFL_ATTRIBUTE;
+ ret_lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_SHARED_LOCK);
+
+ /* Overflow extents btree */
+ btreePtr = VTOF(hfsmp->hfs_extents_vp)->fcbBTCBPtr;
+ total_nodes = btreePtr->totalNodes;
+ free_nodes = btreePtr->freeNodes;
+ fsinfo->extents = hfs_percent(free_nodes, total_nodes);
+
+ /* Catalog btree */
+ btreePtr = VTOF(hfsmp->hfs_catalog_vp)->fcbBTCBPtr;
+ total_nodes = btreePtr->totalNodes;
+ free_nodes = btreePtr->freeNodes;
+ fsinfo->catalog = hfs_percent(free_nodes, total_nodes);
+
+ /* Attributes btree */
+ if (hfsmp->hfs_attribute_vp) {
+ btreePtr = VTOF(hfsmp->hfs_attribute_vp)->fcbBTCBPtr;
+ total_nodes = btreePtr->totalNodes;
+ free_nodes = btreePtr->freeNodes;
+ fsinfo->attribute = hfs_percent(free_nodes, total_nodes);
+ }
+
+ hfs_systemfile_unlock(hfsmp, ret_lockflags);
+ return 0;
+}
+
+/*
+ * Helper function to calculate log base 2 for a given number
+ */
+static inline int
+hfs_log2(uint64_t entry)
+{
+ return (63 - __builtin_clzll(entry|1));
+}
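+
+/*
+ * For example (illustrative): hfs_log2(0) == 0, hfs_log2(1) == 0,
+ * hfs_log2(2) == 1, hfs_log2(1023) == 9 and hfs_log2(1024) == 10; i.e. it
+ * returns the index of the highest set bit, treating 0 as if it were 1.
+ */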
+
+/*
+ * Helper function to account for an input entry in the data
+ * array based on its log base 2 value
+ */
+__private_extern__
+void hfs_fsinfo_data_add(struct hfs_fsinfo_data *fsinfo, uint64_t entry)
+{
+ /*
+ * From hfs_fsctl.h -
+ *
+ * hfs_fsinfo_data is a generic data structure to aggregate information like sizes
+ * or counts in power-of-2 buckets.  Each bucket represents a range of values
+ * that is determined based on its index in the array.  Specifically, bucket[i]
+ * represents values that are greater than or equal to 2^(i-1) and less than 2^i,
+ * except the last bucket, which represents the range greater than or equal to 2^(i-1).
+ *
+ * The current maximum number of buckets is 42 (indices 0 to 41), so we can
+ * represent the range from 0 up to 1TB in increments of powers of 2, plus a
+ * catch-all bucket for anything that is greater than or equal to 1TB.
+ *
+ * For example,
+ * bucket[0] -> greater than or equal to 0 and less than 1
+ * bucket[1] -> greater than or equal to 1 and less than 2
+ * bucket[10] -> greater than or equal to 2^(10-1) = 512 and less than 2^10 = 1024
+ * bucket[20] -> greater than or equal to 2^(20-1) = 512KB and less than 2^20 = 1MB
+ * bucket[41] -> greater than or equal to 2^(41-1) = 1TB
+ */
+ uint32_t bucket;
+
+ if (entry) {
+ /*
+ * Calculate log base 2 value for the entry.
+ * Account for this value in the appropriate bucket.
+ * The last bucket is a catch-all bucket of
+ * anything that is greater than or equal to 1TB
+ */
+ bucket = MIN(hfs_log2(entry) + 1, HFS_FSINFO_DATA_MAX_BUCKETS-1);
+ ++fsinfo->bucket[bucket];
+ } else {
+ /* Entry is zero, so account it in 0th offset */
+ fsinfo->bucket[0]++;
+ }
+}
+
+/*
+ * Function to traverse all the records of a btree and then call the caller-provided
+ * callback function for every record found.  The type of btree is chosen based
+ * on the fileID provided by the caller.  This function grabs the correct locks
+ * depending on the type of btree it will be traversing and the flags provided
+ * by the caller.
+ *
+ * Note: It might drop and reacquire the locks during execution.
+ */
+static errno_t
+traverse_btree(struct hfsmount *hfsmp, uint32_t btree_fileID, traverse_btree_flag_t flags,
+ void *fsinfo, int (*callback)(struct hfsmount *, HFSPlusKey *, HFSPlusRecord *, void *))
+{
+ int error = 0;
+ int lockflags = 0;
+ int ret_lockflags = 0;
+ FCB *fcb;
+ struct BTreeIterator *iterator = NULL;
+ struct FSBufferDescriptor btdata;
+ int btree_operation;
+ HFSPlusRecord record;
+ HFSPlusKey *key;
+ uint64_t start, timeout_abs;
+
+ switch(btree_fileID) {
+ case kHFSExtentsFileID:
+ fcb = VTOF(hfsmp->hfs_extents_vp);
+ lockflags = SFL_EXTENTS;
+ break;
+ case kHFSCatalogFileID:
+ fcb = VTOF(hfsmp->hfs_catalog_vp);
+ lockflags = SFL_CATALOG;
+ break;
+ case kHFSAttributesFileID:
+ // Attributes file doesn't exist; there are no records to iterate.
+ if (hfsmp->hfs_attribute_vp == NULL)
+ return error;
+ fcb = VTOF(hfsmp->hfs_attribute_vp);
+ lockflags = SFL_ATTRIBUTE;
+ break;
+
+ default:
+ return EINVAL;
+ }
+
+ MALLOC(iterator, struct BTreeIterator *, sizeof(struct BTreeIterator), M_TEMP, M_WAITOK | M_ZERO);
+
+ /* The key is initialized to zero because we are traversing entire btree */
+ key = (HFSPlusKey *)&iterator->key;
+
+ if (flags & TRAVERSE_BTREE_EXTENTS) {
+ lockflags |= SFL_EXTENTS;
+ }
+
+ btdata.bufferAddress = &record;
+ btdata.itemSize = sizeof(HFSPlusRecord);
+ btdata.itemCount = 1;
+
+ /* Lock btree for duration of traversal */
+ ret_lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_SHARED_LOCK);
+ btree_operation = kBTreeFirstRecord;
+
+ nanoseconds_to_absolutetime(HFS_FSINFO_MAX_LOCKHELD_TIME, &timeout_abs);
+ start = mach_absolute_time();
+
+ while (1) {
+
+ if (msleep(NULL, NULL, PINOD | PCATCH,
+ "hfs_fsinfo", NULL) == EINTR) {
+ error = EINTR;
+ break;
+ }
+
+ error = BTIterateRecord(fcb, btree_operation, iterator, &btdata, NULL);
+ if (error != 0) {
+ if (error == fsBTRecordNotFoundErr || error == fsBTEndOfIterationErr) {
+ error = 0;
+ }
+ break;
+ }
+ /* Lookup next btree record on next call to BTIterateRecord() */
+ btree_operation = kBTreeNextRecord;
+
+ /* Call our callback function and stop iteration if there are any errors */
+ error = callback(hfsmp, key, &record, fsinfo);
+ if (error) {
+ break;
+ }
+
+ /* let someone else use the tree after we've processed over HFS_FSINFO_MAX_LOCKHELD_TIME */
+ if ((mach_absolute_time() - start) >= timeout_abs) {
+
+ /* release b-tree locks and let someone else get the lock */
+ hfs_systemfile_unlock (hfsmp, ret_lockflags);
+
+ /* add tsleep here to force context switch and fairness */
+ tsleep((caddr_t)hfsmp, PRIBIO, "hfs_fsinfo", 1);
+
+ /*
+ * Re-acquire the locks in the same way that we wanted them originally.
+ * Note: it is subtle but worth pointing out that between the time we
+ * released these locks and now, the b-trees may have shifted significantly.
+ * For example, the catalog or another b-tree could have grown
+ * past 8 extents and now requires the extents lock to be held in order to be safely
+ * manipulated. We can't be sure of the state of the b-tree from where we last left off.
+ */
+
+ ret_lockflags = hfs_systemfile_lock (hfsmp, lockflags, HFS_SHARED_LOCK);
+
+ /*
+ * It's highly likely that the search key we stashed away before dropping the
+ * lock no longer points to an existing item. The iterator's IterateRecord is able to
+ * re-position itself and process the next record correctly. With the lock dropped,
+ * some records might be missed for statistics gathering, which is ok. The
+ * point is to get aggregate values.
+ */
+
+ start = mach_absolute_time();
+
+ /* loop back around and get another record */
+ }
+ }
+
+ hfs_systemfile_unlock(hfsmp, ret_lockflags);
+ FREE (iterator, M_TEMP);
+ return MacToVFSError(error);
+}
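+
+/*
+ * Illustrative sketch (not part of this change): a minimal callback with the
+ * signature traverse_btree() expects.  The callback name and the uint64_t
+ * counter passed through the opaque data pointer are hypothetical.
+ *
+ *	static errno_t
+ *	fsinfo_example_count_files_callback(__unused struct hfsmount *hfsmp,
+ *			__unused HFSPlusKey *key, HFSPlusRecord *record, void *data)
+ *	{
+ *		if (record->file_record.recordType == kHFSPlusFileRecord)
+ *			(*(uint64_t *)data)++;
+ *		return 0;
+ *	}
+ *
+ * It would be invoked as:
+ *	error = traverse_btree(hfsmp, kHFSCatalogFileID, 0, &count,
+ *			&fsinfo_example_count_files_callback);
+ */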
+
+/*
+ * Callback function to get distribution of number of extents
+ * for all user files in the given file system. Note that this only
+ * accounts for data fork, no resource fork.
+ */
+static errno_t
+fsinfo_file_extent_count_callback(struct hfsmount *hfsmp,
+ __unused HFSPlusKey *key, HFSPlusRecord *record, void *data)
+{
+ int i;
+ int error = 0;
+ uint32_t num_extents = 0;
+ uint32_t num_overflow = 0;
+ uint32_t blockCount;
+
+ if (record->file_record.recordType == kHFSPlusFileRecord) {
+ /* Count total number of extents for this file */
+ for (i = 0; i < kHFSPlusExtentDensity; i++) {
+ blockCount = record->file_record.dataFork.extents[i].blockCount;
+ if (blockCount == 0) {
+ break;
+ }
+ num_extents++;
+ }
+ /* This file has overflow extent records, so search overflow btree */
+ if (num_extents >= kHFSPlusExtentDensity) {
+ /* The caller also holds the extents overflow btree lock */
+ error = hfs_count_overflow_extents(hfsmp, record->file_record.fileID, &num_overflow);
+ if (error) {
+ goto out;
+ }
+ num_extents += num_overflow;
+ }
+ hfs_fsinfo_data_add(data, num_extents);
+ }
+out:
+ return error;
+}
+
+/*
+ * Callback function to get distribution of individual extent sizes
+ * (in bytes) for all user files in the given file system from the catalog
+ * btree only. Note that this only accounts for data fork, no resource
+ * fork.
+ */
+static errno_t fsinfo_file_extent_size_catalog_callback(__unused struct hfsmount *hfsmp,
+ __unused HFSPlusKey *key, HFSPlusRecord *record, void *data)
+{
+ int i;
+ uint32_t blockCount;
+ uint64_t extent_size;
+
+ if (record->file_record.recordType == kHFSPlusFileRecord) {
+ /* Traverse through all valid extents */
+ for (i = 0; i < kHFSPlusExtentDensity; i++) {
+ blockCount = record->file_record.dataFork.extents[i].blockCount;
+ if (blockCount == 0) {
+ break;
+ }
+ extent_size = hfs_blk_to_bytes(blockCount, hfsmp->blockSize);
+ hfs_fsinfo_data_add(data, extent_size);
+ }
+ }
+ return 0;
+}
+
+/*
+ * Callback function to get distribution of individual extent sizes
+ * (in bytes) for all user files in the given file system from the overflow
+ * extents btree only. Note that this only accounts for data fork,
+ * no resource fork.
+ */
+static errno_t fsinfo_file_extent_size_overflow_callback(__unused struct hfsmount *hfsmp,
+ HFSPlusKey *key, HFSPlusRecord *record, void *data)
+{
+ int i;
+ uint32_t blockCount;
+ uint64_t extent_size;
+
+ if (key->extent_key.fileID >= kHFSFirstUserCatalogNodeID) {
+ // Only count the data fork extents.
+ if (key->extent_key.forkType == kHFSDataForkType) {
+ for (i = 0; i < kHFSPlusExtentDensity; i++) {
+ blockCount = record->extent_record[i].blockCount;
+ if (blockCount == 0) {
+ break;
+ }
+ extent_size = hfs_blk_to_bytes(blockCount, hfsmp->blockSize);
+ hfs_fsinfo_data_add(data, extent_size);
+ }
+ }
+ }
+ return 0;
+}
+
+/*
+ * Callback function to get distribution of file sizes (in bytes)
+ * for all user files in the given file system. Note that this only
+ * accounts for data fork, no resource fork.
+ */
+static errno_t fsinfo_file_size_callback(__unused struct hfsmount *hfsmp,
+ __unused HFSPlusKey *key, HFSPlusRecord *record, void *data)
+{
+ if (record->file_record.recordType == kHFSPlusFileRecord) {
+ /* Record of interest, account for the size in the bucket */
+ hfs_fsinfo_data_add(data, record->file_record.dataFork.logicalSize);
+ }
+ return 0;
+}
+
+/*
+ * Callback function to get distribution of directory valence
+ * for all directories in the given file system.
+ */
+static errno_t fsinfo_dir_valence_callback(__unused struct hfsmount *hfsmp,
+ __unused HFSPlusKey *key, HFSPlusRecord *record, void *data)
+{
+ if (record->folder_record.recordType == kHFSPlusFolderRecord) {
+ hfs_fsinfo_data_add(data, record->folder_record.valence);
+ }
+ return 0;
+}
+
+/*
+ * Callback function to get distribution of number of unicode
+ * characters in name for all files and directories for a given
+ * file system.
+ */
+static errno_t fsinfo_name_size_callback(__unused struct hfsmount *hfsmp,
+ __unused HFSPlusKey *key, HFSPlusRecord *record, void *data)
+{
+ struct hfs_fsinfo_name *fsinfo = (struct hfs_fsinfo_name *)data;
+ uint32_t length;
+
+ if ((record->folder_record.recordType == kHFSPlusFolderThreadRecord) ||
+ (record->folder_record.recordType == kHFSPlusFileThreadRecord)) {
+ length = record->thread_record.nodeName.length;
+ /* Make sure that the nodeName is bounded, otherwise return error */
+ if (length > kHFSPlusMaxFileNameChars) {
+ return EIO;
+ }
+
+ // sanity check for a name length of zero, which isn't valid on disk.
+ if (length == 0)
+ return EIO;
+
+ /* Round it down to the nearest multiple of 5 to match our bucket granularity */
+ length = (length - 1)/ 5;
+ /* Account this value into our bucket */
+ fsinfo->bucket[length]++;
+ }
+ return 0;
+}
+
+/*
+ * Callback function to get distribution of size of all extended
+ * attributes for a given file system.
+ */
+static errno_t fsinfo_xattr_size_callback(__unused struct hfsmount *hfsmp,
+ __unused HFSPlusKey *key, HFSPlusRecord *record, void *data)
+{
+ if (record->attr_record.recordType == kHFSPlusAttrInlineData) {
+ /* Inline attribute */
+ hfs_fsinfo_data_add(data, record->attr_record.attrData.attrSize);
+ } else if (record->attr_record.recordType == kHFSPlusAttrForkData) {
+ /* Larger attributes with extents information */
+ hfs_fsinfo_data_add(data, record->attr_record.forkData.theFork.logicalSize);
+ }
+ return 0;
+}
+
+
+/*
+ * Callback function to get distribution of free space extents for a given file system.
+ */
+static void fsinfo_free_extents_callback(void *data, off_t free_extent_size)
+{
+ // Assume a minimum of 4 KB block size
+ hfs_fsinfo_data_add(data, free_extent_size / 4096);
+}
+
+/*
+ * Function to get distribution of free space extents for a given file system.
+ */
+static errno_t hfs_fsinfo_free_extents(struct hfsmount *hfsmp, struct hfs_fsinfo_data *fsinfo)
+{
+ return hfs_find_free_extents(hfsmp, &fsinfo_free_extents_callback, fsinfo);
+}
+
+/*
+ * Callback function to get distribution of symbolic link sizes (in bytes)
+ * for all user files in the given file system. Note that this only
+ * accounts for data fork, no resource fork.
+ */
+static errno_t fsinfo_symlink_size_callback(__unused struct hfsmount *hfsmp,
+ __unused HFSPlusKey *key, HFSPlusRecord *record, void *data)
+{
+ if (record->file_record.recordType == kHFSPlusFileRecord) {
+ /* Record of interest, account for the size in the bucket */
+ if (S_ISLNK(record->file_record.bsdInfo.fileMode))
+ hfs_fsinfo_data_add((struct hfs_fsinfo_data *)data, record->file_record.dataFork.logicalSize);
+ }
+ return 0;
+}
+
+#if CONFIG_PROTECT
+/*
+ * Callback function to get total number of files/directories
+ * for each content protection class
+ */
+static int fsinfo_cprotect_count_callback(struct hfsmount *hfsmp, HFSPlusKey *key,
+ HFSPlusRecord *record, void *data)
+{
+ struct hfs_fsinfo_cprotect *fsinfo = (struct hfs_fsinfo_cprotect *)data;
+ static const uint16_t cp_xattrname_utf16[] = CONTENT_PROTECTION_XATTR_NAME_CHARS;
+ static const size_t cp_xattrname_utf16_len = sizeof(cp_xattrname_utf16)/2;
+ struct cp_xattr_v5 *xattr;
+ size_t xattr_len = sizeof(struct cp_xattr_v5);
+ struct cprotect cp_entry;
+ struct cprotect *cp_entryp = &cp_entry;
+ int error = 0;
+
+ /* Content protect xattrs are inline attributes only, so skip all others */
+ if (record->attr_record.recordType != kHFSPlusAttrInlineData)
+ return 0;
+
+ /* We only look at content protection xattrs */
+ if ((key->attr_key.attrNameLen != cp_xattrname_utf16_len) ||
+ (bcmp(key->attr_key.attrName, cp_xattrname_utf16, cp_xattrname_utf16_len))) {
+ return 0;
+ }
+
+ xattr = (struct cp_xattr_v5 *)((void *)(record->attr_record.attrData.attrData));
+ error = cp_read_xattr_v5(hfsmp, xattr, xattr_len, (cprotect_t *)&cp_entryp,
+ CP_GET_XATTR_BASIC_INFO);
+ if (error)
+ return 0;
+
+ /* No key present, skip this record */
+ if (!ISSET(cp_entry.cp_flags, CP_HAS_A_KEY))
+ return 0;
+
+ /* Now account for the persistent class */
+ switch (CP_CLASS(cp_entry.cp_pclass)) {
+ case PROTECTION_CLASS_A:
+ fsinfo->class_A++;
+ break;
+ case PROTECTION_CLASS_B:
+ fsinfo->class_B++;
+ break;
+ case PROTECTION_CLASS_C:
+ fsinfo->class_C++;
+ break;
+ case PROTECTION_CLASS_D:
+ fsinfo->class_D++;
+ break;
+ case PROTECTION_CLASS_E:
+ fsinfo->class_E++;
+ break;
+ case PROTECTION_CLASS_F:
+ fsinfo->class_F++;
+ break;
+ };
+
+ return 0;
+}
+#endif
/*
- * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
int
hfs_vnop_ioctl( struct vnop_ioctl_args /* {
vnode_t a_vp;
- int a_command;
+ long a_command;
caddr_t a_data;
int a_fflag;
vfs_context_t a_context;
break;
}
+ case HFS_GET_FSINFO: {
+ hfs_fsinfo *fsinfo = (hfs_fsinfo *)ap->a_data;
+
+ /* Only root is allowed to get fsinfo */
+ if (!kauth_cred_issuser(kauth_cred_get())) {
+ return EACCES;
+ }
+
+ /*
+ * Make sure that the caller's version number matches
+ * the kernel's version number. This will make sure that
+ * if the structures being read/written into are changed
+ * by the kernel, the caller will not read incorrect data.
+ *
+ * The first three fields --- request_type, version and
+ * flags are the same for all the hfs_fsinfo structures, so
+ * we can access the version number by assuming any
+ * structure for now.
+ */
+ if (fsinfo->header.version != HFS_FSINFO_VERSION) {
+ return ENOTSUP;
+ }
+
+ /* Make sure that the current file system is not marked inconsistent */
+ if (hfsmp->vcbAtrb & kHFSVolumeInconsistentMask) {
+ return EIO;
+ }
+
+ return hfs_get_fsinfo(hfsmp, ap->a_data);
+ }
+
case HFS_CS_FREESPACE_TRIM: {
int error = 0;
int lockflags = 0;
* truncate lock)
*/
rm_done:
- hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
hfs_unlockpair(dcp, cp);
+ hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
if (recycle_rsrc) {
/* inactive or reclaim on rvp will clean up the blocks from the rsrc fork */
wakeup((caddr_t)&tdcp->c_flag);
}
+ hfs_unlockfour(fdcp, fcp, tdcp, tcp);
+
if (took_trunc_lock) {
hfs_unlock_truncate(VTOC(tvp), HFS_LOCK_DEFAULT);
}
- hfs_unlockfour(fdcp, fcp, tdcp, tcp);
-
/* Now vnode_put the resource forks vnodes if necessary */
if (tvp_rsrc) {
vnode_put(tvp_rsrc);
if ( i != kNumExtentsToCache ) // if the buffer is not full, we must be done
{
- err = DeleteExtents( vcb, srcFileID, forkType, quitEarly, isHFSPlus ); // Now delete all the extent entries with the sourceID
+ err = DeleteExtents( vcb, srcFileID, quitEarly, forkType, isHFSPlus ); // Now delete all the extent entries with the sourceID
if ( DEBUG_BUILD && err != noErr )
DebugStr("Error from DeleteExtents");
break; // we're done!
/*
- * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
*/
-#include "../../hfs_macos_defs.h"
-
#include <sys/types.h>
#include <sys/buf.h>
+
+
+#if !HFS_ALLOC_TEST
+
+#include "../../hfs_macos_defs.h"
#include <sys/systm.h>
-#include <sys/sysctl.h>
-#include <sys/disk.h>
#include <sys/ubc.h>
-#include <sys/uio.h>
#include <kern/kalloc.h>
-#include <sys/malloc.h>
/* For VM Page size */
#include <libkern/libkern.h>
-
#include "../../hfs.h"
-#include "../../hfs_dbg.h"
-#include "../../hfs_format.h"
#include "../../hfs_endian.h"
-#include "../../hfs_macos_defs.h"
#include "../headers/FileMgrInternal.h"
+#include <vfs/vfs_journal.h>
+
+#endif // !HFS_ALLOC_TEST
+
+#include <sys/sysctl.h>
+#include <sys/disk.h>
+#include <sys/uio.h>
+#include <sys/malloc.h>
+
+#include "../../hfs_dbg.h"
+#include "../../hfs_format.h"
#include "../../hfs_kdebug.h"
/* Headers for unmap-on-mount support */
-#include <vfs/vfs_journal.h>
#include <sys/disk.h>
#ifndef CONFIG_HFS_TRIM
static Boolean add_free_extent_cache(struct hfsmount *hfsmp, u_int32_t startBlock, u_int32_t blockCount);
static void sanity_check_free_ext(struct hfsmount *hfsmp, int check_allocated);
+/* Functions for getting free extents */
+
+typedef struct bitmap_context {
+ void *bitmap; // current bitmap chunk
+ uint32_t run_offset; // offset (in bits) from start of bitmap to start of current run
+ uint32_t chunk_current; // next bit to scan in the chunk
+ uint32_t chunk_end; // number of valid bits in this chunk
+ struct hfsmount *hfsmp;
+ struct buf *bp;
+ uint32_t last_free_summary_bit; // last marked free summary bit
+ int lockflags;
+ uint64_t lock_start;
+} bitmap_context_t;
+
+
+static errno_t get_more_bits(bitmap_context_t *bitmap_ctx);
+static int bit_count_set(void *bitmap, int start, int end);
+static int bit_count_clr(void *bitmap, int start, int end);
+static errno_t hfs_bit_count(bitmap_context_t *bitmap_ctx, int (*fn)(void *, int ,int), uint32_t *bit_count);
+static errno_t hfs_bit_count_set(bitmap_context_t *bitmap_ctx, uint32_t *count);
+static errno_t hfs_bit_count_clr(bitmap_context_t *bitmap_ctx, uint32_t *count);
+static errno_t update_summary_table(bitmap_context_t *bitmap_ctx, uint32_t start, uint32_t count, bool set);
+static int clzll(uint64_t x);
+
#if ALLOC_DEBUG
/*
* Validation Routine to verify that the TRIM list maintained by the journal
lck_spin_unlock(&hfsmp->vcbFreeExtLock);
}
+#define BIT_RIGHT_MASK(bit) (0xffffffffffffffffull >> (bit))
+#define kHighBitInDoubleWordMask 0x8000000000000000ull
+
+static int clzll(uint64_t x)
+{
+ if (x == 0)
+ return 64;
+ else
+ return __builtin_clzll(x);
+}
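+
+/*
+ * Note: __builtin_clzll() has undefined behavior for an argument of zero,
+ * which is why clzll() special-cases it to return 64.  For example,
+ * clzll(1) == 63 and clzll(kHighBitInDoubleWordMask) == 0, while
+ * BIT_RIGHT_MASK(4) keeps only the low 60 bits set.
+ */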
+
+#if !HFS_ALLOC_TEST
+
+static errno_t get_more_bits(bitmap_context_t *bitmap_ctx)
+{
+ uint32_t start_bit;
+ uint32_t iosize = 0;
+ uint32_t byte_offset;
+ uint32_t last_bitmap_block;
+ int error;
+ struct hfsmount *hfsmp = bitmap_ctx->hfsmp;
+#if !HFS_ALLOC_TEST
+ uint64_t lock_elapsed;
+#endif
+
+
+ if (bitmap_ctx->bp)
+ ReleaseScanBitmapRange(bitmap_ctx->bp);
+
+ if (msleep(NULL, NULL, PINOD | PCATCH,
+ "hfs_fsinfo", NULL) == EINTR) {
+ return EINTR;
+ }
+
+#if !HFS_ALLOC_TEST
+ /*
+ * Let someone else use the allocation map after we've processed over HFS_FSINFO_MAX_LOCKHELD_TIME.
+ * lock_start is initialized in hfs_find_free_extents().
+ */
+ absolutetime_to_nanoseconds(mach_absolute_time() - bitmap_ctx->lock_start, &lock_elapsed);
+
+ if (lock_elapsed >= HFS_FSINFO_MAX_LOCKHELD_TIME) {
+
+ hfs_systemfile_unlock(hfsmp, bitmap_ctx->lockflags);
+
+ /* add tsleep here to force context switch and fairness */
+ tsleep((caddr_t)get_more_bits, PRIBIO, "hfs_fsinfo", 1);
+
+ hfs_journal_lock(hfsmp);
+
+ /* Flush the journal and wait for all I/Os to finish up */
+ error = hfs_journal_flush(hfsmp, TRUE);
+ if (error) {
+ hfs_journal_unlock(hfsmp);
+ return error;
+ }
+
+ /*
+ * Take bitmap lock to ensure it is not being modified while journal is still held.
+ * Since we are reading larger than normal blocks from the bitmap, which
+ * might confuse other parts of the bitmap code using normal blocks, we
+ * take exclusive lock here.
+ */
+ bitmap_ctx->lockflags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK);
+
+ bitmap_ctx->lock_start = mach_absolute_time();
+
+ /* Release the journal lock */
+ hfs_journal_unlock(hfsmp);
+
+ /*
+ * Bitmap is read in large block size (up to 1MB),
+ * unlike the runtime which reads the bitmap in the
+ * 4K block size. If the bitmap is read by both ways
+ * at the same time, it can result in multiple buf_t with
+ * different sizes and potentially cause data corruption.
+ * To avoid this, we invalidate all the existing buffers
+ * associated with the bitmap vnode.
+ */
+ error = buf_invalidateblks(hfsmp->hfs_allocation_vp, 0, 0, 0);
+ if (error) {
+ /* hfs_systemfile_unlock will be called in the caller */
+ return error;
+ }
+ }
+#endif
+
+ start_bit = bitmap_ctx->run_offset;
+
+ if (start_bit >= bitmap_ctx->hfsmp->totalBlocks) {
+ bitmap_ctx->chunk_end = 0;
+ bitmap_ctx->bp = NULL;
+ bitmap_ctx->bitmap = NULL;
+ return 0;
+ }
+
+ assert(start_bit % 8 == 0);
+
+ /*
+ * Compute how much I/O we should generate here.
+ * hfs_scan_range_size will validate that the start bit
+ * converted into a byte offset into the bitmap file,
+ * is aligned on a VBMIOSize boundary.
+ */
+ error = hfs_scan_range_size (bitmap_ctx->hfsmp, start_bit, &iosize);
+ if (error)
+ return error;
+
+ /* hfs_scan_range_size should have verified startbit. Convert it to bytes */
+ byte_offset = start_bit / kBitsPerByte;
+
+ /*
+ * When the journal replays blocks, it does so by writing directly to the disk
+ * device (bypassing any filesystem vnodes and such). When it finishes its I/Os
+ * it also immediately re-reads and invalidates the range covered by the bp so
+ * it does not leave anything lingering in the cache (for iosize reasons).
+ *
+ * As such, it is safe to do large I/Os here with ReadBitmapRange.
+ *
+ * NOTE: It is not recommended, but it is possible to call the function below
+ * on sections of the bitmap that may be in core already as long as the pages are not
+ * dirty. In that case, we'd notice that something starting at that
+ * logical block of the bitmap exists in the metadata cache, and we'd check
+ * if the iosize requested is the same as what was already allocated for it.
+ * Odds are pretty good we're going to request something larger. In that case,
+ * we just free the existing memory associated with the buf and reallocate a
+ * larger range. This function should immediately invalidate it as soon as we're
+ * done scanning, so this shouldn't cause any coherency issues.
+ */
+ error = ReadBitmapRange(bitmap_ctx->hfsmp, byte_offset, iosize, (uint32_t **)&bitmap_ctx->bitmap, &bitmap_ctx->bp);
+ if (error)
+ return error;
+
+ /*
+ * At this point, we have a giant wired buffer that represents some portion of
+ * the bitmap file that we want to analyze. We may not have gotten all 'iosize'
+ * bytes though, so clip our ending bit to what we actually read in.
+ */
+ last_bitmap_block = start_bit + buf_count(bitmap_ctx->bp) * kBitsPerByte;
+
+ /* Cap the last block to the total number of blocks if required */
+ if (last_bitmap_block > bitmap_ctx->hfsmp->totalBlocks)
+ last_bitmap_block = bitmap_ctx->hfsmp->totalBlocks;
+
+ bitmap_ctx->chunk_current = 0; // new chunk of bitmap
+ bitmap_ctx->chunk_end = last_bitmap_block - start_bit;
+
+ return 0;
+}
+
+#endif // !HFS_ALLOC_TEST
+
+// Returns number of contiguous bits set at start
+static int bit_count_set(void *bitmap, int start, int end)
+{
+ if (start == end)
+ return 0;
+
+ assert(end > start);
+
+ const int start_bit = start & 63;
+ const int end_bit = end & 63;
+
+ uint64_t *p = (uint64_t *)bitmap + start / 64;
+ uint64_t x = ~OSSwapBigToHostInt64(*p);
+
+ if ((start & ~63) == (end & ~63)) {
+ // Start and end in same 64 bits
+ x = (x & BIT_RIGHT_MASK(start_bit)) | BIT_RIGHT_MASK(end_bit);
+ return clzll(x) - start_bit;
+ }
+
+ // Deal with initial unaligned bit
+ x &= BIT_RIGHT_MASK(start_bit);
+
+ if (x)
+ return clzll(x) - start_bit;
+
+ // Go fast
+ ++p;
+ int count = 64 - start_bit;
+ int nquads = (end - end_bit - start - 1) / 64;
+
+ while (nquads--) {
+ if (*p != 0xffffffffffffffffull) {
+ x = ~OSSwapBigToHostInt64(*p);
+ return count + clzll(x);
+ }
+ ++p;
+ count += 64;
+ }
+
+ if (end_bit) {
+ x = ~OSSwapBigToHostInt64(*p) | BIT_RIGHT_MASK(end_bit);
+ count += clzll(x);
+ }
+
+ return count;
+}
+
+/* Returns the length of a run of cleared bits:
+ * bitmap is a single chunk of memory being examined
+ * start: the start bit relative to the current buffer to be examined; start is inclusive.
+ * end: the end bit relative to the current buffer to be examined; end is not inclusive.
+ */
+static int bit_count_clr(void *bitmap, int start, int end)
+{
+ if (start == end)
+ return 0;
+
+ assert(end > start);
+
+ const int start_bit = start & 63;
+ const int end_bit = end & 63;
+
+ uint64_t *p = (uint64_t *)bitmap + start / 64;
+ uint64_t x = OSSwapBigToHostInt64(*p);
+
+ if ((start & ~63) == (end & ~63)) {
+ // Start and end in same 64 bits
+ x = (x & BIT_RIGHT_MASK(start_bit)) | BIT_RIGHT_MASK(end_bit);
+
+ return clzll(x) - start_bit;
+ }
+
+ // Deal with initial unaligned bit
+ x &= BIT_RIGHT_MASK(start_bit);
+
+ if (x)
+ return clzll(x) - start_bit;
+
+ // Go fast
+ ++p;
+ int count = 64 - start_bit;
+ int nquads = (end - end_bit - start - 1) / 64;
+
+ while (nquads--) {
+ if (*p) {
+ x = OSSwapBigToHostInt64(*p);
+ return count + clzll(x);
+ }
+ ++p;
+ count += 64;
+ }
+
+ if (end_bit) {
+ x = OSSwapBigToHostInt64(*p) | BIT_RIGHT_MASK(end_bit);
+
+ count += clzll(x);
+ }
+
+ return count;
+}
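bit_count_set() and bit_count_clr() above measure a run word-at-a-time: mask off the bits before the starting position, stop as soon as a word contains a bit that ends the run, and otherwise add a full 64 bits per word. A compact user-space sketch of the same idea for host-endian words (not the kernel routines themselves; it omits the big-endian swap and the summary-table bookkeeping they feed):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static int clzll(uint64_t x) { return x == 0 ? 64 : __builtin_clzll(x); }

/*
 * Length of the run of clear (set == false) or set (set == true) bits
 * starting at bit 'start' and stopping no later than bit 'end' (exclusive).
 * Bit 0 is the most significant bit of words[0], mirroring the on-disk layout.
 */
static int run_length(const uint64_t *words, int start, int end, bool set)
{
        int count = 0;

        while (start + count < end) {
                int bit = (start + count) & 63;
                uint64_t w = words[(start + count) / 64];

                if (set)
                        w = ~w;                          /* a clear bit now ends the run */

                w &= 0xffffffffffffffffull >> bit;       /* ignore bits before the run */
                if (w != 0) {
                        int stop = clzll(w);             /* first bit that ends the run */
                        int len = count + (stop - bit);
                        return len > end - start ? end - start : len;
                }
                count += 64 - bit;                       /* whole rest of the word matches */
        }
        return end - start;
}

int main(void)
{
        /* Blocks 0-2 allocated, 3-9 free, 10 allocated, remainder free. */
        uint64_t bitmap[2] = { 0xe020000000000000ull, 0 };

        printf("set run at 0:    %d\n", run_length(bitmap, 0, 128, true));   /* 3   */
        printf("clear run at 3:  %d\n", run_length(bitmap, 3, 128, false));  /* 7   */
        printf("clear run at 11: %d\n", run_length(bitmap, 11, 128, false)); /* 117 */
        return 0;
}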
+
+#if !HFS_ALLOC_TEST
+static errno_t update_summary_table(bitmap_context_t *bitmap_ctx, uint32_t start, uint32_t count, bool set)
+{
+ uint32_t end, start_summary_bit, end_summary_bit;
+ errno_t error = 0;
+
+ if (count == 0)
+ goto out;
+
+ if (!ISSET(bitmap_ctx->hfsmp->hfs_flags, HFS_SUMMARY_TABLE))
+ return 0;
+
+ if (hfs_get_summary_index (bitmap_ctx->hfsmp, start, &start_summary_bit)) {
+ error = EINVAL;
+ goto out;
+ }
+
+ end = start + count - 1;
+ if (hfs_get_summary_index (bitmap_ctx->hfsmp, end, &end_summary_bit)) {
+ error = EINVAL;
+ goto out;
+ }
+
+ // if summary table bit has been updated with free block previously, leave it.
+ if ((start_summary_bit == bitmap_ctx->last_free_summary_bit) && set)
+ start_summary_bit++;
+
+ for (uint32_t summary_bit = start_summary_bit; summary_bit <= end_summary_bit; summary_bit++)
+ hfs_set_summary (bitmap_ctx->hfsmp, summary_bit, set);
+
+ if (!set)
+ bitmap_ctx->last_free_summary_bit = end_summary_bit;
+
+out:
+ return error;
+
+}
+#endif //!HFS_ALLOC_TEST
+
+/*
+ * Read the bitmap into memory in chunks and find a run of cleared/set bits;
+ * the run can extend across chunk boundaries.
+ * bit_count_clr can be passed to get a run of cleared bits.
+ * bit_count_set can be passed to get a run of set bits.
+ */
+static errno_t hfs_bit_count(bitmap_context_t *bitmap_ctx, int (*fn)(void *, int ,int), uint32_t *bit_count)
+{
+ int count;
+ errno_t error = 0;
+
+ *bit_count = 0;
+
+ do {
+ if (bitmap_ctx->run_offset == 0 || bitmap_ctx->chunk_current == bitmap_ctx->chunk_end) {
+ if ((error = get_more_bits(bitmap_ctx)) != 0)
+ goto out;
+ }
+
+ if (bitmap_ctx->chunk_end == 0)
+ break;
+
+ count = fn(bitmap_ctx->bitmap, bitmap_ctx->chunk_current, bitmap_ctx->chunk_end);
+
+ bitmap_ctx->run_offset += count;
+ bitmap_ctx->chunk_current += count;
+ *bit_count += count;
+
+ } while (bitmap_ctx->chunk_current >= bitmap_ctx->chunk_end && count);
+
+out:
+ return error;
+
+}
+
+// Returns count of number of bits clear
+static errno_t hfs_bit_count_clr(bitmap_context_t *bitmap_ctx, uint32_t *count)
+{
+ return hfs_bit_count(bitmap_ctx, bit_count_clr, count);
+}
+
+// Returns count of number of bits set
+static errno_t hfs_bit_count_set(bitmap_context_t *bitmap_ctx, uint32_t *count)
+{
+ return hfs_bit_count(bitmap_ctx, bit_count_set, count);
+}
+
+static uint32_t hfs_bit_offset(bitmap_context_t *bitmap_ctx)
+{
+ return bitmap_ctx->run_offset;
+}
+
+/*
+ * Perform a full scan of the bitmap file.
+ * Note: during the scan of the bitmap file, it may drop and reacquire the
+ * bitmap lock to let someone else use the bitmap, for fairness.
+ * Currently it is used by HFS_GET_FSINFO statistics gathering, which
+ * is run while other processes might perform HFS operations.
+ */
+
+errno_t hfs_find_free_extents(struct hfsmount *hfsmp,
+ void (*callback)(void *data, off_t free_extent_size), void *callback_arg)
+{
+ struct bitmap_context bitmap_ctx;
+ uint32_t count;
+ errno_t error = 0;
+
+ if ((hfsmp->hfs_flags & HFS_SUMMARY_TABLE) == 0) {
+ error = hfs_init_summary(hfsmp);
+ if (error)
+ return error;
+ }
+
+ bzero(&bitmap_ctx, sizeof(struct bitmap_context));
+
+ /*
+ * The journal maintains list of recently deallocated blocks to
+ * issue DKIOCUNMAPs when the corresponding journal transaction is
+ * flushed to the disk. To avoid any race conditions, we only
+ * want one active trim list. Therefore we make sure that the
+ * journal trim list is sync'ed, empty, and not modifiable for
+ * the duration of our scan.
+ *
+ * Take the journal lock before flushing the journal to the disk.
+ * We will keep holding the journal lock until we acquire the
+ * bitmap lock, to make sure that no new journal transactions can
+ * start. This ensures that the journal trim list is not
+ * modified after the journal flush and before we get the bitmap lock.
+ * We can release the journal lock after we acquire the bitmap
+ * lock as it will prevent any further block deallocations.
+ */
+ hfs_journal_lock(hfsmp);
+
+ /* Flush the journal and wait for all I/Os to finish up */
+ error = hfs_journal_flush(hfsmp, TRUE);
+ if (error) {
+ hfs_journal_unlock(hfsmp);
+ return error;
+ }
+
+ /*
+ * Take bitmap lock to ensure it is not being modified.
+ * Since we are reading larger than normal blocks from the bitmap, which
+ * might confuse other parts of the bitmap code using normal blocks, we
+ * take exclusive lock here.
+ */
+ bitmap_ctx.lockflags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK);
+
+#if !HFS_ALLOC_TEST
+ bitmap_ctx.lock_start = mach_absolute_time();
+#endif
+
+ /* Release the journal lock */
+ hfs_journal_unlock(hfsmp);
+
+ /*
+ * Bitmap is read in large block size (up to 1MB),
+ * unlike the runtime which reads the bitmap in the
+ * 4K block size. If the bitmap is read by both ways
+ * at the same time, it can result in multiple buf_t with
+ * different sizes and potentially cause data corruption.
+ * To avoid this, we invalidate all the existing buffers
+ * associated with the bitmap vnode.
+ */
+ error = buf_invalidateblks(hfsmp->hfs_allocation_vp, 0, 0, 0);
+ if (error)
+ goto out;
+
+ /*
+ * Get the list of all free extent ranges. hfs_alloc_scan_range()
+ * will call hfs_fsinfo_data_add() to account for all the free
+ * extent ranges found during scan.
+ */
+ bitmap_ctx.hfsmp = hfsmp;
+ bitmap_ctx.run_offset = 0;
+
+ while (bitmap_ctx.run_offset < hfsmp->totalBlocks) {
+
+ uint32_t start = hfs_bit_offset(&bitmap_ctx);
+
+ if ((error = hfs_bit_count_clr(&bitmap_ctx, &count)) != 0)
+ goto out;
+
+ if (count)
+ callback(callback_arg, hfs_blk_to_bytes(count, hfsmp->blockSize));
+
+ if ((error = update_summary_table(&bitmap_ctx, start, count, false)) != 0)
+ goto out;
+
+ start = hfs_bit_offset(&bitmap_ctx);
+
+ if ((error = hfs_bit_count_set(&bitmap_ctx, &count)) != 0)
+ goto out;
+
+ if ((error = update_summary_table(&bitmap_ctx, start, count, true)) != 0)
+ goto out;
+ }
+
+out:
+ if (bitmap_ctx.lockflags) {
+ hfs_systemfile_unlock(hfsmp, bitmap_ctx.lockflags);
+ }
+
+ return error;
+}
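hfs_find_free_extents() reports results purely through the callback: one call per contiguous free run, with the length already converted to bytes by hfs_blk_to_bytes(). A hedged sketch of the kind of consumer HFS_GET_FSINFO implies; the struct, bucket boundaries, and fsinfo_cb name are illustrative, not the actual hfs_fsinfo_data_add() implementation:

#include <sys/types.h>
#include <stdint.h>
#include <stdio.h>

/* Histogram of free extents by size; buckets chosen only for illustration. */
struct free_extent_stats {
        uint64_t total_free_bytes;
        uint64_t buckets[4];    /* <1MB, <16MB, <256MB, larger */
};

static void fsinfo_cb(void *data, off_t free_extent_size)
{
        struct free_extent_stats *st = data;

        st->total_free_bytes += (uint64_t)free_extent_size;
        if (free_extent_size < (1 << 20))
                st->buckets[0]++;
        else if (free_extent_size < (16 << 20))
                st->buckets[1]++;
        else if (free_extent_size < (256 << 20))
                st->buckets[2]++;
        else
                st->buckets[3]++;
}

/*
 * In the kernel the call would be roughly:
 *      error = hfs_find_free_extents(hfsmp, fsinfo_cb, &st);
 * after which 'st' summarizes the volume's free-space fragmentation.
 * The main() below just feeds the callback synthetic extent sizes.
 */
int main(void)
{
        struct free_extent_stats st = { 0, { 0 } };
        off_t samples[] = { 4096, 1 << 22, 1 << 28 };

        for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
                fsinfo_cb(&st, samples[i]);

        printf("free bytes %llu, buckets %llu/%llu/%llu/%llu\n",
            (unsigned long long)st.total_free_bytes,
            (unsigned long long)st.buckets[0], (unsigned long long)st.buckets[1],
            (unsigned long long)st.buckets[2], (unsigned long long)st.buckets[3]);
        return 0;
}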
+
/*
- * Copyright (c) 2000-2013 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
#include <sys/param.h>
#include <sys/vnode.h>
+#if !HFS_ALLOC_TEST
+
#include "../../hfs.h"
#include "../../hfs_macos_defs.h"
#include "../../hfs_format.h"
#include "../../hfs_cnode.h"
+#endif
#ifdef __cplusplus
extern "C" {
EXTERN_API_C( int )
hfs_init_summary (struct hfsmount *hfsmp);
+errno_t hfs_find_free_extents(struct hfsmount *hfsmp,
+ void (*callback)(void *data, off_t), void *callback_arg);
+
/* File Extent Mapping routines*/
EXTERN_API_C( OSErr )
FlushExtentFile (ExtendedVCB * vcb);
static void parse_bsd_args(void);
extern task_t bsd_init_task;
+extern boolean_t init_task_died;
extern char init_task_failure_data[];
#if CONFIG_DEV_KMEM
extern void dev_kmem_init(void);
ut = (uthread_t)get_bsdthread_info(thread);
bsd_init_task = get_threadtask(thread);
+ init_task_died = FALSE;
init_task_failure_data[0] = 0;
#if CONFIG_MACF
#include <machine/pal_routines.h>
+extern boolean_t kdebug_serial;
+#if KDEBUG_MOJO_TRACE
+#include <sys/kdebugevents.h>
+static void kdebug_serial_print( /* forward */
+ uint32_t, uint32_t, uint64_t,
+ uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t);
+#endif
+
/*
* IOP(s)
*
#define MACH_SysCall 0x010c0000
#define DBG_SCALL_MASK 0xffff0000
-
/* task to string structure */
struct tts
{
{
int s = ml_set_interrupts_enabled(FALSE);
lck_spin_lock(kds_spin_lock);
-
if (enabled) {
kdebug_enable |= trace_type;
kd_ctrl_page.kdebug_slowcheck &= ~SLOW_NOLOG;
kdbp = &kdbip[coreid];
timestamp &= KDBG_TIMESTAMP_MASK;
+#if KDEBUG_MOJO_TRACE
+ if (kdebug_enable & KDEBUG_ENABLE_SERIAL)
+ kdebug_serial_print(coreid, debugid, timestamp,
+ arg1, arg2, arg3, arg4, threadid);
+#endif
+
retry_q:
kds_raw = kdbp->kd_list_tail;
cpu = cpu_number();
kdbp = &kdbip[cpu];
+
+#if KDEBUG_MOJO_TRACE
+ if (kdebug_enable & KDEBUG_ENABLE_SERIAL)
+ kdebug_serial_print(cpu, debugid,
+ mach_absolute_time() & KDBG_TIMESTAMP_MASK,
+ arg1, arg2, arg3, arg4, arg5);
+#endif
+
retry_q:
kds_raw = kdbp->kd_list_tail;
/* Stuff the message string in the args and log it. */
strncpy((char *)arg, message, MIN(sizeof(arg), strlen(message)));
KERNEL_DEBUG_EARLY(
- (TRACEDBG_CODE(DBG_TRACE_INFO, 4)) | DBG_FUNC_NONE,
+ TRACE_INFO_STRING,
arg[0], arg[1], arg[2], arg[3]);
}
uintptr_t arg4)
{
/* If tracing is already initialized, use it */
- if (nkdbufs)
+ if (nkdbufs) {
KERNEL_DEBUG_CONSTANT(debugid, arg1, arg2, arg3, arg4, 0);
+ return;
+ }
/* Do nothing if the buffer is full or we're not on the boot cpu */
kd_early_overflow = kd_early_index >= KD_EARLY_BUFFER_MAX;
}
/*
- * Transfen the contents of the temporary buffer into the trace buffers.
+ * Transfer the contents of the temporary buffer into the trace buffers.
* Precede that by logging the rebase time (offset) - the TSC-based time (in ns)
* when mach_absolute_time is set to 0.
*/
/* Fake sentinel marking the start of kernel time relative to TSC */
kernel_debug_enter(
0,
- (TRACEDBG_CODE(DBG_TRACE_INFO, 1)) | DBG_FUNC_NONE,
+ TRACE_TIMESTAMPS,
0,
(uint32_t)(tsc_rebase_abs_time >> 32),
(uint32_t)tsc_rebase_abs_time,
/* Cut events-lost event on overflow */
if (kd_early_overflow)
KERNEL_DEBUG_CONSTANT(
- TRACEDBG_CODE(DBG_TRACE_INFO, 2), 0, 0, 0, 0, 0);
+ TRACE_LOST_EVENTS, 0, 0, 0, 0, 0);
/* This trace marks the start of kernel tracing */
kernel_debug_string("early trace done");
if (name[0] == KERN_KDWRITETR) {
number = nkdbufs * sizeof(kd_buf);
- KERNEL_DEBUG_CONSTANT((TRACEDBG_CODE(DBG_TRACE_INFO, 3)) | DBG_FUNC_START, 0, 0, 0, 0, 0);
+ KERNEL_DEBUG_CONSTANT(TRACE_WRITING_EVENTS | DBG_FUNC_START, 0, 0, 0, 0, 0);
ret = kdbg_read(0, &number, vp, &context);
- KERNEL_DEBUG_CONSTANT((TRACEDBG_CODE(DBG_TRACE_INFO, 3)) | DBG_FUNC_END, number, 0, 0, 0, 0);
+ KERNEL_DEBUG_CONSTANT(TRACE_WRITING_EVENTS | DBG_FUNC_END, number, 0, 0, 0, 0);
*sizep = number;
} else {
return EINVAL;
memset(&lostevent, 0, sizeof(lostevent));
- lostevent.debugid = TRACEDBG_CODE(DBG_TRACE_INFO, 2);
+ lostevent.debugid = TRACE_LOST_EVENTS;
/* Capture timestamp. Only sort events that have occurred before the timestamp.
* Since the iop is being flushed here, it's possible that events occur on the AP
/* Hold off interrupts until the early traces are cut */
boolean_t s = ml_set_interrupts_enabled(FALSE);
- kdbg_set_tracing_enabled(TRUE, KDEBUG_ENABLE_TRACE);
+ kdbg_set_tracing_enabled(
+ TRUE,
+ kdebug_serial ?
+ (KDEBUG_ENABLE_TRACE | KDEBUG_ENABLE_SERIAL) :
+ KDEBUG_ENABLE_TRACE);
/*
* Transfer all very early events from the static buffer
ml_set_interrupts_enabled(s);
printf("kernel tracing started\n");
+#if KDEBUG_MOJO_TRACE
+ if (kdebug_serial) {
+ printf("serial output enabled with %lu named events\n",
+ sizeof(kd_events)/sizeof(kd_event_t));
+ }
+#endif
} else {
- printf("error from kdbg_reinit,kernel tracing not started\n");
+ printf("error from kdbg_reinit, kernel tracing not started\n");
}
}
return;
}
}
- KERNEL_DEBUG_CONSTANT((TRACEDBG_CODE(DBG_TRACE_INFO, 0)) | DBG_FUNC_NONE, 0, 0, 0, 0, 0);
+ KERNEL_DEBUG_CONSTANT(TRACE_PANIC | DBG_FUNC_NONE, 0, 0, 0, 0, 0);
kdebug_enable = 0;
kd_ctrl_page.enabled = 0;
else
snprintf(name_buf, len, "%p [!bsd]", task);
}
+
+#if KDEBUG_MOJO_TRACE
+static kd_event_t *
+binary_search(uint32_t id)
+{
+ int low, high, mid;
+
+ low = 0;
+ high = sizeof(kd_events)/sizeof(kd_event_t) - 1;
+
+ while (TRUE)
+ {
+ mid = (low + high) / 2;
+
+ if (low > high)
+ return NULL; /* failed */
+ else if ( low + 1 >= high) {
+ /* We have a match */
+ if (kd_events[high].id == id)
+ return &kd_events[high];
+ else if (kd_events[low].id == id)
+ return &kd_events[low];
+ else
+ return NULL; /* search failed */
+ }
+ else if (id < kd_events[mid].id)
+ high = mid;
+ else
+ low = mid;
+ }
+}
+
+/*
+ * Look up event id to get name string.
+ * Using a per-cpu cache of a single entry
+ * before resorting to a binary search of the full table.
+ */
+#define NCACHE 1
+static kd_event_t *last_hit[MAX_CPUS];
+static kd_event_t *
+event_lookup_cache(uint32_t cpu, uint32_t id)
+{
+ if (last_hit[cpu] == NULL || last_hit[cpu]->id != id)
+ last_hit[cpu] = binary_search(id);
+ return last_hit[cpu];
+}
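event_lookup_cache() above keeps a one-entry per-CPU cache in front of the binary search so that back-to-back events with the same id (the common case on a busy tracepoint) skip the log2(N) probe entirely. A small user-space sketch of the same caching idea over a sorted id/name table; the table contents are taken from codes that appear elsewhere in this change, and the lookup is a plain half-open binary search rather than the kernel's variant:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

typedef struct { uint32_t id; const char *name; } kd_event_t;

/* Must be sorted by id for the binary search. */
static kd_event_t table[] = {
        { 0x010c0000, "MACH_SysCall" },
        { 0x01a20028, "SFI_GLOBAL_DEFER" },
        { 0x05310284, "CPUPM_FI" },
};

static kd_event_t *lookup(uint32_t id)
{
        size_t lo = 0, hi = sizeof(table) / sizeof(table[0]);

        while (lo < hi) {
                size_t mid = lo + (hi - lo) / 2;
                if (table[mid].id == id)
                        return &table[mid];
                if (table[mid].id < id)
                        lo = mid + 1;
                else
                        hi = mid;
        }
        return NULL;
}

/* One cached hit; in the kernel this is per CPU to avoid cache-line sharing. */
static kd_event_t *last_hit;

static const char *lookup_cached(uint32_t id)
{
        if (last_hit == NULL || last_hit->id != id)
                last_hit = lookup(id);
        return last_hit ? last_hit->name : "(unknown)";
}

int main(void)
{
        printf("%s\n", lookup_cached(0x05310284)); /* binary search */
        printf("%s\n", lookup_cached(0x05310284)); /* served from the cache */
        printf("%s\n", lookup_cached(0xdeadbeef)); /* miss */
        return 0;
}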
+
+static uint64_t kd_last_timstamp;
+
+static void
+kdebug_serial_print(
+ uint32_t cpunum,
+ uint32_t debugid,
+ uint64_t timestamp,
+ uintptr_t arg1,
+ uintptr_t arg2,
+ uintptr_t arg3,
+ uintptr_t arg4,
+ uintptr_t threadid
+ )
+{
+ char kprintf_line[192];
+ char event[40];
+ uint64_t us = timestamp / NSEC_PER_USEC;
+ uint64_t us_tenth = (timestamp % NSEC_PER_USEC) / 100;
+ uint64_t delta = timestamp - kd_last_timstamp;
+ uint64_t delta_us = delta / NSEC_PER_USEC;
+ uint64_t delta_us_tenth = (delta % NSEC_PER_USEC) / 100;
+ uint32_t event_id = debugid & DBG_FUNC_MASK;
+ const char *command;
+ const char *bra;
+ const char *ket;
+ kd_event_t *ep;
+
+ /* event time and delta from last */
+ snprintf(kprintf_line, sizeof(kprintf_line),
+ "%11llu.%1llu %8llu.%1llu ",
+ us, us_tenth, delta_us, delta_us_tenth);
+
+
+ /* event (id or name) - start prefixed by "[", end postfixed by "]" */
+ bra = (debugid & DBG_FUNC_START) ? "[" : " ";
+ ket = (debugid & DBG_FUNC_END) ? "]" : " ";
+ ep = event_lookup_cache(cpunum, event_id);
+ if (ep) {
+ if (strlen(ep->name) < sizeof(event) - 3)
+ snprintf(event, sizeof(event), "%s%s%s",
+ bra, ep->name, ket);
+ else
+ snprintf(event, sizeof(event), "%s%x(name too long)%s",
+ bra, event_id, ket);
+ } else {
+ snprintf(event, sizeof(event), "%s%x%s",
+ bra, event_id, ket);
+ }
+ snprintf(kprintf_line + strlen(kprintf_line),
+ sizeof(kprintf_line) - strlen(kprintf_line),
+ "%-40s ", event);
+
+ /* arg1 .. arg4 with special cases for strings */
+ switch (event_id) {
+ case VFS_LOOKUP:
+ case VFS_LOOKUP_DONE:
+ if (debugid & DBG_FUNC_START) {
+ /* arg1 hex then arg2..arg4 chars */
+ snprintf(kprintf_line + strlen(kprintf_line),
+ sizeof(kprintf_line) - strlen(kprintf_line),
+ "%-16lx %-8s%-8s%-8s ",
+ arg1, (char*)&arg2, (char*)&arg3, (char*)&arg4);
+ break;
+ }
+ /* else fall through for arg1..arg4 chars */
+ case TRACE_STRING_EXEC:
+ case TRACE_STRING_NEWTHREAD:
+ case TRACE_INFO_STRING:
+ snprintf(kprintf_line + strlen(kprintf_line),
+ sizeof(kprintf_line) - strlen(kprintf_line),
+ "%-8s%-8s%-8s%-8s ",
+ (char*)&arg1, (char*)&arg2, (char*)&arg3, (char*)&arg4);
+ break;
+ default:
+ snprintf(kprintf_line + strlen(kprintf_line),
+ sizeof(kprintf_line) - strlen(kprintf_line),
+ "%-16lx %-16lx %-16lx %-16lx",
+ arg1, arg2, arg3, arg4);
+ }
+
+ /* threadid, cpu and command name */
+ if (threadid == (uintptr_t)thread_tid(current_thread()) &&
+ current_proc() &&
+ current_proc()->p_comm)
+ command = current_proc()->p_comm;
+ else
+ command = "-";
+ snprintf(kprintf_line + strlen(kprintf_line),
+ sizeof(kprintf_line) - strlen(kprintf_line),
+ " %-16lx %-2d %s\n",
+ threadid, cpunum, command);
+
+ kprintf("%s", kprintf_line);
+ kd_last_timstamp = timestamp;
+}
+#endif
/*
- * Copyright (c) 1999-2014 Apple Inc. All rights reserved.
+ * Copyright (c) 1999-2015 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
struct sockaddr_ctl sa;
struct ctl_cb *kcb = (struct ctl_cb *)so->so_pcb;
struct ctl_cb *kcb_next = NULL;
+ u_quad_t sbmaxsize;
+ u_int32_t recvbufsize, sendbufsize;
if (kcb == 0)
panic("ctl_connect so_pcb null\n");
kctlstat.kcs_connections++;
lck_mtx_unlock(ctl_mtx);
- error = soreserve(so, kctl->sendbufsize, kctl->recvbufsize);
+ /*
+ * rdar://15526688: Limit the send and receive sizes to sb_max
+ * by using the same scaling as sbreserve()
+ */
+ sbmaxsize = (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES);
+
+ if (kctl->sendbufsize > sbmaxsize)
+ sendbufsize = sbmaxsize;
+ else
+ sendbufsize = kctl->sendbufsize;
+
+ if (kctl->recvbufsize > sbmaxsize)
+ recvbufsize = sbmaxsize;
+ else
+ recvbufsize = kctl->recvbufsize;
+
+ error = soreserve(so, sendbufsize, recvbufsize);
if (error) {
printf("%s - soreserve(%llx, %u, %u) error %d\n", __func__,
(uint64_t)VM_KERNEL_ADDRPERM(so),
- kctl->sendbufsize, kctl->recvbufsize, error);
+ sendbufsize, recvbufsize, error);
goto done;
}
soisconnecting(so);
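The clamp above mirrors sbreserve(): the effective ceiling is not sb_max itself but sb_max scaled by MCLBYTES / (MSIZE + MCLBYTES), since socket-buffer accounting charges mbuf overhead as well as cluster data. A worked example with commonly used values; the 8 MB sb_max, 256-byte MSIZE, and 2048-byte MCLBYTES are assumptions about the build, not values taken from this diff:

#include <stdio.h>

int main(void)
{
        /* Assumed defaults; the kernel gets these from param.h / kern.ipc.maxsockbuf. */
        unsigned long long sb_max   = 8ULL * 1024 * 1024;
        unsigned long long msize    = 256;
        unsigned long long mclbytes = 2048;

        unsigned long long sbmaxsize = sb_max * mclbytes / (msize + mclbytes);

        /* A kernel control asking for 16 MB would be clamped to roughly 7.1 MB. */
        unsigned long long requested = 16ULL * 1024 * 1024;
        unsigned long long granted = requested > sbmaxsize ? sbmaxsize : requested;

        printf("sbmaxsize = %llu bytes\n", sbmaxsize);
        printf("granted   = %llu bytes\n", granted);
        return 0;
}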
struct sockbuf *sb = &so->so_rcv;
u_int32_t space = sbspace(sb);
errno_t error;
-
+
if ((kctl->flags & CTL_FLAG_REG_CRIT) == 0) {
if ((u_int32_t) space >= datasize)
error = 0;
{
struct kctl *kctl = NULL;
struct kctl *kctl_next = NULL;
- u_int32_t id = 1;
- size_t name_len;
- int is_extended = 0;
- u_quad_t sbmaxsize;
+ u_int32_t id = 1;
+ size_t name_len;
+ int is_extended = 0;
if (userkctl == NULL) /* sanity check */
return (EINVAL);
/*
* Let the caller know the default send and receive sizes
- *
- * rdar://15526688: Limit the send and receive sizes to sb_max
- * by using the same scaling as sbreserve()
*/
- sbmaxsize = (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES);
-
- if (userkctl->ctl_sendsize == 0)
+ if (userkctl->ctl_sendsize == 0) {
kctl->sendbufsize = CTL_SENDSIZE;
- else if (userkctl->ctl_sendsize > sbmaxsize)
- kctl->sendbufsize = sbmaxsize;
- else
- kctl->sendbufsize = userkctl->ctl_sendsize;
- userkctl->ctl_sendsize = kctl->sendbufsize;
-
- if (userkctl->ctl_recvsize == 0)
+ userkctl->ctl_sendsize = kctl->sendbufsize;
+ } else {
+ kctl->sendbufsize = userkctl->ctl_sendsize;
+ }
+ if (userkctl->ctl_recvsize == 0) {
kctl->recvbufsize = CTL_RECVSIZE;
- else if (userkctl->ctl_recvsize > sbmaxsize)
- kctl->recvbufsize = sbmaxsize;
- else
- kctl->recvbufsize = userkctl->ctl_recvsize;
- userkctl->ctl_recvsize = kctl->recvbufsize;
+ userkctl->ctl_recvsize = kctl->recvbufsize;
+ } else {
+ kctl->recvbufsize = userkctl->ctl_recvsize;
+ }
kctl->connect = userkctl->ctl_connect;
kctl->disconnect = userkctl->ctl_disconnect;
#include <kern/clock.h>
#include <kern/thread_call.h>
#include <kern/sched_prim.h>
+#include <kern/wait_queue.h>
#include <kern/zalloc.h>
#include <kern/assert.h>
int oktodrop;
oktodrop = ((kn->kn_status & (KN_DROPPING | KN_ATTACHING)) == 0);
+ kn->kn_status &= ~KN_STAYQUEUED;
kn->kn_status |= KN_DROPPING;
if (oktodrop) {
if (kn->kn_inuse == 0) {
kq->kq_p = p;
} else {
FREE_ZONE(kq, sizeof (struct kqueue), M_KQUEUE);
+ kq = NULL;
}
}
kern_return_t kr;
kr = wait_queue_unlink_nofree(wq, kq->kq_wqs, wqlp);
- kqlock(kq);
- kn->kn_status &= ~KN_STAYQUEUED;
- knote_dequeue(kn);
- kqunlock(kq);
+ knote_clearstayqueued(kn);
return ((kr != KERN_SUCCESS) ? EINVAL : 0);
}
knote_enqueue(kn);
kqunlock(kn->kn_kq);
}
+
+void
+knote_clearstayqueued(struct knote *kn)
+{
+ kqlock(kn->kn_kq);
+ kn->kn_status &= ~KN_STAYQUEUED;
+ knote_dequeue(kn);
+ kqunlock(kn->kn_kq);
+}
int nfat_arch = 0, pr = 0, f = 0;
nfat_arch = OSSwapBigToHostInt32(fat_header->nfat_arch);
+
+ /* make sure bogus nfat_arch doesn't cause chaos - 19376072 */
+ if ( (sizeof(struct fat_header) + (nfat_arch * sizeof(struct fat_arch))) > PAGE_SIZE ) {
+ error = EBADEXEC;
+ goto bad;
+ }
+
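The PAGE_SIZE bound above caps how many fat_arch entries a binary may declare before the loader rejects it. A quick check of the arithmetic; the struct layouts come from the public <mach-o/fat.h> (8-byte fat_header, 20-byte fat_arch), and the 4096-byte page size is an assumption:

#include <stdio.h>
#include <mach-o/fat.h>

int main(void)
{
        unsigned long page_size = 4096;         /* assumed PAGE_SIZE */
        unsigned long max_arches =
            (page_size - sizeof(struct fat_header)) / sizeof(struct fat_arch);

        /* With the standard 8-byte header and 20-byte entries this prints 204. */
        printf("largest nfat_arch accepted: %lu\n", max_arches);
        return 0;
}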
/* Check each preference listed against all arches in header */
for (pr = 0; pr < NBINPREFS; pr++) {
cpu_type_t pref = psa->psa_binprefs[pr];
kdbg_trace_string(p, &dbg_arg1, &dbg_arg2, &dbg_arg3, &dbg_arg4);
if (vfexec || spawn) {
- KERNEL_DEBUG_CONSTANT1((TRACEDBG_CODE(DBG_TRACE_DATA, 2)) | DBG_FUNC_NONE,
+ KERNEL_DEBUG_CONSTANT1(TRACE_DATA_EXEC | DBG_FUNC_NONE,
p->p_pid ,0,0,0, (uintptr_t)thread_tid(thread));
- KERNEL_DEBUG_CONSTANT1((TRACEDBG_CODE(DBG_TRACE_STRING, 2)) | DBG_FUNC_NONE,
+ KERNEL_DEBUG_CONSTANT1(TRACE_STRING_EXEC | DBG_FUNC_NONE,
dbg_arg1, dbg_arg2, dbg_arg3, dbg_arg4, (uintptr_t)thread_tid(thread));
} else {
- KERNEL_DEBUG_CONSTANT((TRACEDBG_CODE(DBG_TRACE_DATA, 2)) | DBG_FUNC_NONE,
+ KERNEL_DEBUG_CONSTANT(TRACE_DATA_EXEC | DBG_FUNC_NONE,
p->p_pid ,0,0,0,0);
- KERNEL_DEBUG_CONSTANT((TRACEDBG_CODE(DBG_TRACE_STRING, 2)) | DBG_FUNC_NONE,
+ KERNEL_DEBUG_CONSTANT(TRACE_STRING_EXEC | DBG_FUNC_NONE,
dbg_arg1, dbg_arg2, dbg_arg3, dbg_arg4, 0);
}
}
/* notify only if it has not failed due to FP Key error */
if ((p->p_lflag & P_LTERM_DECRYPTFAIL) == 0)
proc_knote(p, NOTE_EXEC);
- } else {
+ } else if (error == 0) {
/* reset the importance attribute from our previous life */
task_importance_reset(p->task);
#include <sys/sdt.h>
+extern boolean_t init_task_died;
extern char init_task_failure_data[];
void proc_prepareexit(proc_t p, int rv, boolean_t perf_notify);
void vfork_exit(proc_t p, int rv);
sync(p, (void *)NULL, (int *)NULL);
}
#endif
+ init_task_died = TRUE;
panic("%s died\nState at Last Exception:\n\n%s",
(p->p_comm[0] != '\0' ?
p->p_comm :
kauth_cred_t my_cred, my_new_cred;
posix_cred_t my_pcred;
-
uid = uap->uid;
+ /* get current credential and take a reference while we muck with it */
my_cred = kauth_cred_proc_ref(p);
my_pcred = posix_cred_get(my_cred);
DEBUG_CRED_ENTER("setuid (%d/%d): %p %d\n", p->p_pid, (p->p_pptr ? p->p_pptr->p_pid : 0), my_cred, uap->uid);
AUDIT_ARG(uid, uid);
- if (uid != my_pcred->cr_ruid && /* allow setuid(getuid()) */
- uid != my_pcred->cr_svuid && /* allow setuid(saved uid) */
- (error = suser(my_cred, &p->p_acflag))) {
- kauth_cred_unref(&my_cred);
- return (error);
- }
- /*
- * Everything's okay, do it.
- */
+ for (;;) {
+ if (uid != my_pcred->cr_ruid && /* allow setuid(getuid()) */
+ uid != my_pcred->cr_svuid && /* allow setuid(saved uid) */
+ (error = suser(my_cred, &p->p_acflag))) {
+ kauth_cred_unref(&my_cred);
+ return (error);
+ }
- /*
- * If we are priviledged, then set the saved and real UID too;
- * otherwise, just set the effective UID
- */
- if (suser(my_cred, &p->p_acflag) == 0) {
- svuid = uid;
- ruid = uid;
/*
- * Transfer proc count to new user.
- * chgproccnt uses list lock for protection
+ * If we are privileged, then set the saved and real UID too;
+ * otherwise, just set the effective UID
*/
- (void)chgproccnt(uid, 1);
- (void)chgproccnt(my_pcred->cr_ruid, -1);
- }
-
- /* get current credential and take a reference while we muck with it */
- for (;;) {
+ if (suser(my_cred, &p->p_acflag) == 0) {
+ svuid = uid;
+ ruid = uid;
+ } else {
+ svuid = KAUTH_UID_NONE;
+ ruid = KAUTH_UID_NONE;
+ }
/*
* Only set the gmuid if the current cred has not opt'ed out;
* this normally only happens when calling setgroups() instead
DEBUG_CRED_CHANGE("setuid CH(%d): %p/0x%08x -> %p/0x%08x\n", p->p_pid, my_cred, my_pcred->cr_flags, my_new_cred, posix_cred_get(my_new_cred)->cr_flags);
+ /*
+ * If we're changing the ruid from A to B, we might race with another thread that's setting ruid from B to A.
+ * The current locking mechanisms don't allow us to make the entire credential switch operation atomic,
+ * thus we may be able to change the process credentials from ruid A to B, but get preempted before incrementing the proc
+ * count of B. If a second thread sees the new process credentials and switches back to ruid A, that other thread
+ * may be able to decrement the proc count of B before we can increment it. This results in a panic.
+ * Incrementing the proc count of the target ruid, B, before setting the process credentials prevents this race.
+ */
+ if (ruid != KAUTH_UID_NONE) {
+ (void)chgproccnt(ruid, 1);
+ }
+
proc_lock(p);
/*
* We need to protect for a race where another thread
* also changed the credential after we took our
* reference. If p_ucred has changed then we should
* restart this again with the new cred.
+ *
+ * Note: kauth_cred_setresuid() has consumed a reference to my_cred; if p_ucred != my_cred, then my_cred must not be dereferenced!
*/
if (p->p_ucred != my_cred) {
proc_unlock(p);
+ /*
+ * We didn't successfully switch to the new ruid, so decrement
+ * the procs/uid count that we incremented above.
+ */
+ if (ruid != KAUTH_UID_NONE) {
+ (void)chgproccnt(ruid, -1);
+ }
kauth_cred_unref(&my_new_cred);
my_cred = kauth_cred_proc_ref(p);
+ my_pcred = posix_cred_get(my_cred);
/* try again */
continue;
}
OSBitOrAtomic(P_SUGID, &p->p_flag);
proc_unlock(p);
+ /*
+ * If we've updated the ruid, decrement the count of procs running
+ * under the previous ruid
+ */
+ if (ruid != KAUTH_UID_NONE) {
+ (void)chgproccnt(my_pcred->cr_ruid, -1);
+ }
}
break;
}
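The ordering the comments above describe is the heart of the change: charge the new ruid's per-uid process count before publishing the new credential, and release the old ruid's count only afterwards, so a racing thread that flips the ruid back can never drive a count negative. A toy sketch of that ordering with plain C11 atomics; the counter array, uid range, and function names are illustrative stand-ins, not kernel interfaces:

#include <stdatomic.h>
#include <stdio.h>

#define NUIDS 4

static atomic_int procs_per_uid[NUIDS]; /* stand-in for the chgproccnt() table */
static atomic_int proc_ruid;            /* stand-in for the published credential */

static void switch_ruid(int new_ruid)
{
        /* 1. charge the new ruid first ... */
        atomic_fetch_add(&procs_per_uid[new_ruid], 1);

        /* 2. ... then publish the new credential ... */
        int old_ruid = atomic_exchange(&proc_ruid, new_ruid);

        /* 3. ... and only now release the old ruid's count. */
        atomic_fetch_sub(&procs_per_uid[old_ruid], 1);
}

int main(void)
{
        atomic_fetch_add(&procs_per_uid[0], 1); /* process starts out under uid 0 */

        switch_ruid(1);
        switch_ruid(0);         /* another thread could do this concurrently */

        for (int uid = 0; uid < NUIDS; uid++)
                printf("uid %d: %d procs\n", uid, atomic_load(&procs_per_uid[uid]));
        return 0;
}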
my_cred = kauth_cred_proc_ref(p);
my_pcred = posix_cred_get(my_cred);
- if (euid != my_pcred->cr_ruid && euid != my_pcred->cr_svuid &&
- (error = suser(my_cred, &p->p_acflag))) {
- kauth_cred_unref(&my_cred);
- return (error);
- }
-
- /*
- * Everything's okay, do it. Copy credentials so other references do
- * not see our changes. get current credential and take a reference
- * while we muck with it
- */
for (;;) {
+
+ if (euid != my_pcred->cr_ruid && euid != my_pcred->cr_svuid &&
+ (error = suser(my_cred, &p->p_acflag))) {
+ kauth_cred_unref(&my_cred);
+ return (error);
+ }
+
/*
* Set the credential with new info. If there is no change,
* we get back the same credential we passed in; if there is
proc_unlock(p);
kauth_cred_unref(&my_new_cred);
my_cred = kauth_cred_proc_ref(p);
+ my_pcred = posix_cred_get(my_cred);
/* try again */
continue;
}
my_cred = kauth_cred_proc_ref(p);
my_pcred = posix_cred_get(my_cred);
- if (((ruid != KAUTH_UID_NONE && /* allow no change of ruid */
- ruid != my_pcred->cr_ruid && /* allow ruid = ruid */
- ruid != my_pcred->cr_uid && /* allow ruid = euid */
- ruid != my_pcred->cr_svuid) || /* allow ruid = svuid */
- (euid != KAUTH_UID_NONE && /* allow no change of euid */
- euid != my_pcred->cr_uid && /* allow euid = euid */
- euid != my_pcred->cr_ruid && /* allow euid = ruid */
- euid != my_pcred->cr_svuid)) && /* allow euid = svui */
- (error = suser(my_cred, &p->p_acflag))) { /* allow root user any */
- kauth_cred_unref(&my_cred);
- return (error);
- }
-
- /*
- * Everything's okay, do it. Copy credentials so other references do
- * not see our changes. get current credential and take a reference
- * while we muck with it
- */
for (;;) {
+
+ if (((ruid != KAUTH_UID_NONE && /* allow no change of ruid */
+ ruid != my_pcred->cr_ruid && /* allow ruid = ruid */
+ ruid != my_pcred->cr_uid && /* allow ruid = euid */
+ ruid != my_pcred->cr_svuid) || /* allow ruid = svuid */
+ (euid != KAUTH_UID_NONE && /* allow no change of euid */
+ euid != my_pcred->cr_uid && /* allow euid = euid */
+ euid != my_pcred->cr_ruid && /* allow euid = ruid */
+ euid != my_pcred->cr_svuid)) && /* allow euid = svuid */
+ (error = suser(my_cred, &p->p_acflag))) { /* allow root user any */
+ kauth_cred_unref(&my_cred);
+ return (error);
+ }
+
uid_t new_euid;
- uid_t new_ruid;
uid_t svuid = KAUTH_UID_NONE;
new_euid = my_pcred->cr_uid;
- new_ruid = my_pcred->cr_ruid;
-
/*
* Set the credential with new info. If there is no change,
* we get back the same credential we passed in; if there is
* passed in. The subsequent compare is safe, because it is
* a pointer compare rather than a contents compare.
*/
- if (euid == KAUTH_UID_NONE && my_pcred->cr_uid != euid) {
+ if (euid != KAUTH_UID_NONE && my_pcred->cr_uid != euid) {
/* changing the effective UID */
new_euid = euid;
OSBitOrAtomic(P_SUGID, &p->p_flag);
}
- if (ruid != KAUTH_UID_NONE && my_pcred->cr_ruid != ruid) {
- /* changing the real UID; must do user accounting */
- /* chgproccnt uses list lock for protection */
- (void)chgproccnt(ruid, 1);
- (void)chgproccnt(my_pcred->cr_ruid, -1);
- new_ruid = ruid;
- OSBitOrAtomic(P_SUGID, &p->p_flag);
- }
/*
* If the newly requested real uid or effective uid does
* not match the saved uid, then set the saved uid to the
DEBUG_CRED_CHANGE("setreuid CH(%d): %p/0x%08x -> %p/0x%08x\n", p->p_pid, my_cred, my_pcred->cr_flags, my_new_cred, posix_cred_get(my_new_cred)->cr_flags);
+ /*
+ * If we're changing the ruid from A to B, we might race with another thread that's setting ruid from B to A.
+ * The current locking mechanisms don't allow us to make the entire credential switch operation atomic,
+ * thus we may be able to change the process credentials from ruid A to B, but get preempted before incrementing the proc
+ * count of B. If a second thread sees the new process credentials and switches back to ruid A, that other thread
+ * may be able to decrement the proc count of B before we can increment it. This results in a panic.
+ * Incrementing the proc count of the target ruid, B, before setting the process credentials prevents this race.
+ */
+ if (ruid != KAUTH_UID_NONE) {
+ (void)chgproccnt(ruid, 1);
+ }
+
proc_lock(p);
/*
* We need to protect for a race where another thread
* also changed the credential after we took our
* reference. If p_ucred has changed then we should
* restart this again with the new cred.
+ *
+ * Note: kauth_cred_setresuid() has consumed a reference to my_cred; if p_ucred != my_cred, then my_cred must not be dereferenced!
*/
if (p->p_ucred != my_cred) {
proc_unlock(p);
+ if (ruid != KAUTH_UID_NONE) {
+ /*
+ * We didn't successfully switch to the new ruid, so decrement
+ * the procs/uid count that we incremented above.
+ */
+ (void)chgproccnt(ruid, -1);
+ }
kauth_cred_unref(&my_new_cred);
my_cred = kauth_cred_proc_ref(p);
+ my_pcred = posix_cred_get(my_cred);
/* try again */
continue;
}
+
p->p_ucred = my_new_cred;
/* update cred on proc */
PROC_UPDATE_CREDS_ONPROC(p);
- OSBitOrAtomic(P_SUGID, &p->p_flag); /* XXX redundant? */
+ OSBitOrAtomic(P_SUGID, &p->p_flag);
proc_unlock(p);
+
+ if (ruid != KAUTH_UID_NONE) {
+ /*
+ * We switched to a new ruid, so decrement the count of procs running
+ * under the previous ruid
+ */
+ (void)chgproccnt(my_pcred->cr_ruid, -1);
+ }
}
break;
}
gid = uap->gid;
AUDIT_ARG(gid, gid);
+ /* get current credential and take a reference while we muck with it */
my_cred = kauth_cred_proc_ref(p);
my_pcred = posix_cred_get(my_cred);
- if (gid != my_pcred->cr_rgid && /* allow setgid(getgid()) */
- gid != my_pcred->cr_svgid && /* allow setgid(saved gid) */
- (error = suser(my_cred, &p->p_acflag))) {
- kauth_cred_unref(&my_cred);
- return (error);
- }
+ for (;;) {
+ if (gid != my_pcred->cr_rgid && /* allow setgid(getgid()) */
+ gid != my_pcred->cr_svgid && /* allow setgid(saved gid) */
+ (error = suser(my_cred, &p->p_acflag))) {
+ kauth_cred_unref(&my_cred);
+ return (error);
+ }
- /*
- * If we are priviledged, then set the saved and real GID too;
- * otherwise, just set the effective GID
- */
- if (suser(my_cred, &p->p_acflag) == 0) {
- svgid = gid;
- rgid = gid;
- }
+ /*
+ * If we are privileged, then set the saved and real GID too;
+ * otherwise, just set the effective GID
+ */
+ if (suser(my_cred, &p->p_acflag) == 0) {
+ svgid = gid;
+ rgid = gid;
+ } else {
+ svgid = KAUTH_GID_NONE;
+ rgid = KAUTH_GID_NONE;
+ }
- /* get current credential and take a reference while we muck with it */
- for (;;) {
-
/*
* Set the credential with new info. If there is no change,
* we get back the same credential we passed in; if there is
kauth_cred_unref(&my_new_cred);
/* try again */
my_cred = kauth_cred_proc_ref(p);
+ my_pcred = posix_cred_get(my_cred);
continue;
}
p->p_ucred = my_new_cred;
egid = uap->egid;
AUDIT_ARG(egid, egid);
+ /* get current credential and take a reference while we muck with it */
my_cred = kauth_cred_proc_ref(p);
my_pcred = posix_cred_get(my_cred);
- if (egid != my_pcred->cr_rgid &&
- egid != my_pcred->cr_svgid &&
- (error = suser(my_cred, &p->p_acflag))) {
- kauth_cred_unref(&my_cred);
- return (error);
- }
- /* get current credential and take a reference while we muck with it */
for (;;) {
+ if (egid != my_pcred->cr_rgid &&
+ egid != my_pcred->cr_svgid &&
+ (error = suser(my_cred, &p->p_acflag))) {
+ kauth_cred_unref(&my_cred);
+ return (error);
+ }
/*
* Set the credential with new info. If there is no change,
* we get back the same credential we passed in; if there is
kauth_cred_unref(&my_new_cred);
/* try again */
my_cred = kauth_cred_proc_ref(p);
+ my_pcred = posix_cred_get(my_cred);
continue;
}
p->p_ucred = my_new_cred;
AUDIT_ARG(egid, egid);
AUDIT_ARG(rgid, rgid);
+ /* get current credential and take a reference while we muck with it */
my_cred = kauth_cred_proc_ref(p);
my_pcred = posix_cred_get(my_cred);
- if (((rgid != KAUTH_UID_NONE && /* allow no change of rgid */
- rgid != my_pcred->cr_rgid && /* allow rgid = rgid */
- rgid != my_pcred->cr_gid && /* allow rgid = egid */
- rgid != my_pcred->cr_svgid) || /* allow rgid = svgid */
- (egid != KAUTH_UID_NONE && /* allow no change of egid */
- egid != my_pcred->cr_groups[0] && /* allow no change of egid */
- egid != my_pcred->cr_gid && /* allow egid = egid */
- egid != my_pcred->cr_rgid && /* allow egid = rgid */
- egid != my_pcred->cr_svgid)) && /* allow egid = svgid */
- (error = suser(my_cred, &p->p_acflag))) { /* allow root user any */
- kauth_cred_unref(&my_cred);
- return (error);
- }
-
- /* get current credential and take a reference while we muck with it */
for (;;) {
+
+ if (((rgid != KAUTH_UID_NONE && /* allow no change of rgid */
+ rgid != my_pcred->cr_rgid && /* allow rgid = rgid */
+ rgid != my_pcred->cr_gid && /* allow rgid = egid */
+ rgid != my_pcred->cr_svgid) || /* allow rgid = svgid */
+ (egid != KAUTH_UID_NONE && /* allow no change of egid */
+ egid != my_pcred->cr_groups[0] && /* allow no change of egid */
+ egid != my_pcred->cr_gid && /* allow egid = egid */
+ egid != my_pcred->cr_rgid && /* allow egid = rgid */
+ egid != my_pcred->cr_svgid)) && /* allow egid = svgid */
+ (error = suser(my_cred, &p->p_acflag))) { /* allow root user any */
+ kauth_cred_unref(&my_cred);
+ return (error);
+ }
+
uid_t new_egid = my_pcred->cr_gid;
uid_t new_rgid = my_pcred->cr_rgid;
uid_t svgid = KAUTH_UID_NONE;
* passed in. The subsequent compare is safe, because it is
* a pointer compare rather than a contents compare.
*/
- if (egid == KAUTH_UID_NONE && my_pcred->cr_gid != egid) {
+ if (egid != KAUTH_UID_NONE && my_pcred->cr_gid != egid) {
/* changing the effective GID */
new_egid = egid;
OSBitOrAtomic(P_SUGID, &p->p_flag);
kauth_cred_unref(&my_new_cred);
/* try again */
my_cred = kauth_cred_proc_ref(p);
+ my_pcred = posix_cred_get(my_cred);
continue;
}
p->p_ucred = my_new_cred;
/*
* Set the per-thread override identity. The first parameter can be the
- * current real UID, KAUTH_UID_NONE, or, if the caller is priviledged, it
+ * current real UID, KAUTH_UID_NONE, or, if the caller is privileged, it
* can be any UID. If it is KAUTH_UID_NONE, then as a special case, this
* means "revert to the per process credential"; otherwise, if permitted,
* it changes the effective, real, and saved UIDs and GIDs for the current
extern unsigned int vm_page_speculative_percentage;
extern unsigned int vm_page_speculative_q_age_ms;
+#if (DEVELOPMENT || DEBUG)
+extern uint32_t vm_page_creation_throttled_hard;
+extern uint32_t vm_page_creation_throttled_soft;
+#endif /* DEVELOPMENT || DEBUG */
+
/*
* Conditionally allow dtrace to see these functions for debugging purposes.
*/
SYSCTL_INT(_vm, OID_AUTO, vm_page_filecache_min, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_page_filecache_min, 0, "");
extern int vm_compressor_mode;
+extern int vm_compressor_is_active;
extern uint32_t swapout_target_age;
extern int64_t compressor_bytes_used;
extern uint32_t compressor_eval_period_in_msecs;
extern uint32_t vm_compressor_catchup_threshold_divisor;
SYSCTL_INT(_vm, OID_AUTO, compressor_mode, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_compressor_mode, 0, "");
+SYSCTL_INT(_vm, OID_AUTO, compressor_is_active, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_compressor_is_active, 0, "");
SYSCTL_QUAD(_vm, OID_AUTO, compressor_bytes_used, CTLFLAG_RD | CTLFLAG_LOCKED, &compressor_bytes_used, "");
SYSCTL_INT(_vm, OID_AUTO, compressor_swapout_target_age, CTLFLAG_RD | CTLFLAG_LOCKED, &swapout_target_age, 0, "");
SYSCTL_INT(_vm, OID_AUTO, phantom_cache_thrashing_threshold_ssd, CTLFLAG_RW | CTLFLAG_LOCKED, &phantom_cache_thrashing_threshold_ssd, 0, "");
#endif
+#if (DEVELOPMENT || DEBUG)
+
+SYSCTL_UINT(_vm, OID_AUTO, vm_page_creation_throttled_hard,
+ CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED,
+ &vm_page_creation_throttled_hard, 0, "");
+
+SYSCTL_UINT(_vm, OID_AUTO, vm_page_creation_throttled_soft,
+ CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED,
+ &vm_page_creation_throttled_soft, 0, "");
+
+#endif /* DEVELOPMENT || DEBUG */
+
/*
* Enable tracing of voucher contents
*/
.csflags = 0,
.uuid = { 0 },
.min_vm_addr = MACH_VM_MAX_ADDRESS,
- .max_vm_addr = MACH_VM_MIN_ADDRESS
+ .max_vm_addr = MACH_VM_MIN_ADDRESS,
+ .cs_end_offset = 0
};
/*
} else {
got_code_signatures = TRUE;
}
+
+ if (got_code_signatures) {
+ boolean_t valid = FALSE, tainted = TRUE;
+ struct cs_blob *blobs;
+ vm_size_t off = 0;
+
+
+ if (cs_debug > 10)
+ printf("validating initial pages of %s\n", vp->v_name);
+ blobs = ubc_get_cs_blobs(vp);
+
+ while (off < size && ret == LOAD_SUCCESS) {
+ valid = cs_validate_page(blobs,
+ NULL,
+ file_offset + off,
+ addr + off,
+ &tainted);
+ if (!valid || tainted) {
+ if (cs_debug)
+ printf("CODE SIGNING: %s[%d]: invalid initial page at offset %lld validated:%d tainted:%d csflags:0x%x\n",
+ vp->v_name, p->p_pid, (long long)(file_offset + off), valid, tainted, result->csflags);
+ if (cs_enforcement(NULL) ||
+ (result->csflags & (CS_HARD|CS_KILL|CS_ENFORCEMENT))) {
+ ret = LOAD_FAILURE;
+ }
+ result->csflags &= ~CS_VALID;
+ }
+ off += PAGE_SIZE;
+ }
+ }
+
break;
#if CONFIG_CODE_DECRYPTION
case LC_ENCRYPTION_INFO:
if ((scp->fileoff & PAGE_MASK_64) != 0)
return (LOAD_BADMACHO);
+ /*
+ * If we have a code signature attached for this slice
+ * require that the segments are within the signed part
+ * of the file.
+ */
+ if (result->cs_end_offset &&
+ result->cs_end_offset < (off_t)scp->fileoff &&
+ result->cs_end_offset - scp->fileoff < scp->filesize)
+ {
+ if (cs_debug)
+ printf("section outside code signature\n");
+ return LOAD_BADMACHO;
+ }
+
/*
* Round sizes to page size.
*/
uint32_t size;
int flavor;
uint32_t thread_size;
+ uint32_t *local_ts;
+ uint32_t local_ts_size;
- ret = thread_state_initialize( thread );
- if (ret != KERN_SUCCESS) {
- return(LOAD_FAILURE);
- }
+ local_ts = NULL;
+ local_ts_size = 0;
+
+ ret = thread_state_initialize( thread );
+ if (ret != KERN_SUCCESS) {
+ ret = LOAD_FAILURE;
+ goto done;
+ }
+ if (total_size > 0) {
+ local_ts_size = total_size;
+ local_ts = kalloc(local_ts_size);
+ if (local_ts == NULL) {
+ ret = LOAD_FAILURE;
+ goto done;
+ }
+ memcpy(local_ts, ts, local_ts_size);
+ ts = local_ts;
+ }
+
/*
- * Set the new thread state; iterate through the state flavors in
- * the mach-o file.
+ * Set the new thread state; iterate through the state flavors in
+ * the mach-o file.
*/
while (total_size > 0) {
flavor = *ts++;
size = *ts++;
if (UINT32_MAX-2 < size ||
- UINT32_MAX/sizeof(uint32_t) < size+2)
- return (LOAD_BADMACHO);
+ UINT32_MAX/sizeof(uint32_t) < size+2) {
+ ret = LOAD_BADMACHO;
+ goto done;
+ }
thread_size = (size+2)*sizeof(uint32_t);
- if (thread_size > total_size)
- return(LOAD_BADMACHO);
+ if (thread_size > total_size) {
+ ret = LOAD_BADMACHO;
+ goto done;
+ }
total_size -= thread_size;
/*
* Third argument is a kernel space pointer; it gets cast
*/
ret = thread_setstatus(thread, flavor, (thread_state_t)ts, size);
if (ret != KERN_SUCCESS) {
- return(LOAD_FAILURE);
+ ret = LOAD_FAILURE;
+ goto done;
}
ts += size; /* ts is a (uint32_t *) */
}
- return(LOAD_SUCCESS);
+ ret = LOAD_SUCCESS;
+
+done:
+ if (local_ts != NULL) {
+ kfree(local_ts, local_ts_size);
+ local_ts = NULL;
+ }
+ return ret;
}
static
goto out;
}
- blob = ubc_cs_blob_get(vp, cputype, -1);
+ blob = ubc_cs_blob_get(vp, cputype, macho_offset);
if (blob != NULL) {
/* we already have a blob for this vnode and cputype */
if (blob->csb_cpu_type == cputype &&
ubc_cs_validation_bitmap_allocate( vp );
#endif
- blob = ubc_cs_blob_get(vp, cputype, -1);
+ blob = ubc_cs_blob_get(vp, cputype, macho_offset);
ret = LOAD_SUCCESS;
out:
if (ret == LOAD_SUCCESS) {
result->csflags |= blob->csb_flags;
result->platform_binary = blob->csb_platform_binary;
+ result->cs_end_offset = blob->csb_end_offset;
}
if (addr != 0) {
ubc_cs_blob_deallocate(addr, blob_size);
mach_vm_address_t min_vm_addr;
mach_vm_address_t max_vm_addr;
unsigned int platform_binary;
+ off_t cs_end_offset;
} load_result_t;
struct image_params;
--- /dev/null
+#!/usr/bin/python
+#
+# This script scans the trace.codes file, containing a mapping of event id to
+# event name for all events, and writes to stdout a C declaration for a table
+# named kd_events[] of these mappings.
+# Required to generate a header file used by DEVELOPMENT and DEBUG kernels.
+#
+
+import sys
+import re
+
+# we expect one arg specifying the path to the trace.codes file
+if (len(sys.argv) < 2):
+ exit(1)
+trace_code_file = sys.argv[1]
+
+# regular expression pattern to match <hex_id> <string>
+id_name_pattern = re.compile('0x([0-9a-fA-F]+)\s+([^\s]*)')
+code_table = []
+
+# scan file to generate internal table
+with open(trace_code_file, 'rt') as codes:
+ for line in codes:
+ m = id_name_pattern.match(line)
+ if m:
+ code_table += [(int(m.group(1),base=16), m.group(2))]
+
+# emit typedef:
+print "typedef struct {"
+print " uint32_t id;"
+print " const char *name;"
+print "} kd_event_t;"
+# emit structure declaration and sorted initialization:
+print "kd_event_t kd_events[] = {"
+for mapping in sorted(code_table, key=lambda x: x[0]):
+ print " {0x%x, \"%s\"}," % mapping
+print "};"
+
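For reference, the script emits a header shaped like the following; the two entries shown are taken from the trace.codes additions elsewhere in this change, and a real generated table contains the full, sorted code list:

typedef struct {
        uint32_t id;
        const char *name;
} kd_event_t;
kd_event_t kd_events[] = {
        {0x1a20028, "SFI_GLOBAL_DEFER"},
        {0x5310284, "CPUPM_FI"},
};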
struct stat64 sb;
int error = 0;
+ bzero(&sb, sizeof(struct stat64));
context = vfs_context_create((vfs_context_t)0);
error = vn_stat(vp, &sb, NULL, 1, context);
(void)vfs_context_rele(context);
/* Handle input events */
if (events & ( POLLIN | POLLRDNORM | POLLPRI | POLLRDBAND | POLLHUP )) {
kev.filter = EVFILT_READ;
- if (!(events & ( POLLIN | POLLRDNORM )))
+ if (events & ( POLLPRI | POLLRDBAND ))
kev.flags |= EV_OOBAND;
kerror = kevent_register(kq, &kev, p);
}
struct poll_continue_args *cont = (struct poll_continue_args *)data;
struct pollfd *fds = CAST_DOWN(struct pollfd *, kevp->udata);
short prev_revents = fds->revents;
- short mask;
+ short mask = 0;
/* convert the results back into revents */
if (kevp->flags & EV_EOF)
if (fds->revents & POLLHUP)
mask = (POLLIN | POLLRDNORM | POLLPRI | POLLRDBAND );
else {
- mask = (POLLIN | POLLRDNORM );
+ if ((kevp->flags & EV_ERROR) == 0 && kevp->data != 0)
+ mask = (POLLIN | POLLRDNORM );
if (kevp->flags & EV_OOBAND)
mask |= ( POLLPRI | POLLRDBAND );
}
0x1a2001c SFI_WAIT_CANCELED
0x1a20020 SFI_PID_SET_MANAGED
0x1a20024 SFI_PID_CLEAR_MANAGED
+0x1a20028 SFI_GLOBAL_DEFER
0x1a30004 ENERGY_PERF_GPU_DESCRIPTION
0x1a30008 ENERGY_PERF_GPU_TIME
0x2010000 L_IP_In_Beg
0x5310278 CPUPM_PST_UIB
0x531027C CPUPM_PST_PLIMIT_UIB
0x5310280 CPUPM_IO
+0x5310284 CPUPM_FI
0x5330000 HIBERNATE
0x5330004 HIBERNATE_WRITE_IMAGE
0x5330008 HIBERNATE_MACHINE_INIT
/*
- * Copyright (c) 1998-2014 Apple Inc. All rights reserved.
+ * Copyright (c) 1998-2015 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
switch (kn->kn_filter) {
case EVFILT_READ:
kn->kn_fop = &soread_filtops;
+ /*
+ * If the caller explicitly asked for OOB results (e.g. poll()),
+ * save that off in the hookid field and reserve the kn_flags
+ * EV_OOBAND bit (for output only).
+ */
+ if (kn->kn_flags & EV_OOBAND) {
+ kn->kn_flags &= ~EV_OOBAND;
+ kn->kn_hookid = EV_OOBAND;
+ } else {
+ kn->kn_hookid = 0;
+ }
skl = &so->so_rcv.sb_sel.si_note;
break;
case EVFILT_WRITE:
}
/* socket isn't a listener */
-
kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
+ /*
+ * Clear out EV_OOBAND that filt_soread may have set in the
+ * past.
+ */
+ kn->kn_flags &= ~EV_OOBAND;
- if (so->so_oobmark) {
- if (kn->kn_flags & EV_OOBAND) {
+ if ((so->so_oobmark) || (so->so_state & SS_RCVATMARK)){
+ kn->kn_flags |= EV_OOBAND;
+ /*
+ * If caller registered explicit interest in OOB data,
+ * return immediately (data == amount beyond mark, for
+ * legacy reasons - that should be changed later).
+ */
+ if (kn->kn_hookid == EV_OOBAND) {
+ /*
+ * When so_state is SS_RCVATMARK, so_oobmark
+ * is 0.
+ */
kn->kn_data -= so->so_oobmark;
if ((hint & SO_FILT_HINT_LOCKED) == 0)
socket_unlock(so, 1);
return (1);
}
- kn->kn_data = so->so_oobmark;
- kn->kn_flags |= EV_OOBAND;
- } else {
- if ((so->so_state & SS_CANTRCVMORE)
+ }
+
+ if ((so->so_state & SS_CANTRCVMORE)
#if CONTENT_FILTER
- && cfil_sock_data_pending(&so->so_rcv) == 0
+ && cfil_sock_data_pending(&so->so_rcv) == 0
#endif /* CONTENT_FILTER */
- ) {
- kn->kn_flags |= EV_EOF;
- kn->kn_fflags = so->so_error;
- if ((hint & SO_FILT_HINT_LOCKED) == 0)
- socket_unlock(so, 1);
- return (1);
- }
- }
-
- if (so->so_state & SS_RCVATMARK) {
- if (kn->kn_flags & EV_OOBAND) {
- if ((hint & SO_FILT_HINT_LOCKED) == 0)
- socket_unlock(so, 1);
- return (1);
- }
- kn->kn_flags |= EV_OOBAND;
- } else if (kn->kn_flags & EV_OOBAND) {
- kn->kn_data = 0;
+ ) {
+ kn->kn_flags |= EV_EOF;
+ kn->kn_fflags = so->so_error;
if ((hint & SO_FILT_HINT_LOCKED) == 0)
socket_unlock(so, 1);
- return (0);
+ return (1);
}
if (so->so_error) { /* temporary udp error */
if ((hint & SO_FILT_HINT_LOCKED) == 0)
socket_unlock(so, 1);
- return ((kn->kn_flags & EV_OOBAND) || kn->kn_data >= lowwat);
+ return (kn->kn_data >= lowwat);
}
static void
.\"
-.\" Copyright (c) 2008 Apple Inc. All rights reserved.
+.\" Copyright (c) 2008-2015 Apple Inc. All rights reserved.
.\"
.\" @APPLE_LICENSE_HEADER_START@
.\"
set this flag internally.
.It EV_EOF
Filters may set this flag to indicate filter-specific EOF condition.
+.It EV_OOBAND
+The read filter on a socket may set this flag to indicate the presence of out of
+band data on the descriptor.
.It EV_ERROR
See
.Sx RETURN VALUES
.Va data
contains the number of bytes of protocol data available to read.
.Pp
+The presence of EV_OOBAND in
+.Va flags ,
+indicates the presence of out of band data on the socket, with
+.Va data
+equal to the potential number of OOB bytes available to read.
+.Pp
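A minimal sketch of how a kevent() consumer would observe the flag described above; it uses a UNIX-domain socketpair for brevity, so EV_OOBAND normally stays clear here, but on a TCP socket with pending urgent data the read filter would set it alongside the byte count in data (error handling trimmed):

#include <sys/types.h>
#include <sys/event.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int sv[2];
        if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == -1)
                return 1;

        int kq = kqueue();
        struct kevent ev;
        EV_SET(&ev, sv[0], EVFILT_READ, EV_ADD, 0, 0, NULL);
        kevent(kq, &ev, 1, NULL, 0, NULL);      /* register interest */

        write(sv[1], "hello", 5);               /* make the socket readable */

        struct kevent out;
        if (kevent(kq, NULL, 0, &out, 1, NULL) == 1) {
                printf("%lld byte(s) readable%s%s\n",
                    (long long)out.data,
                    (out.flags & EV_OOBAND) ? ", out-of-band data pending" : "",
                    (out.flags & EV_EOF) ? ", peer closed" : "");
        }

        close(kq);
        close(sv[0]);
        close(sv[1]);
        return 0;
}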
If the read direction of the socket has shutdown, then the filter
also sets EV_EOF in
.Va flags ,
if (ret != KERN_SUCCESS) {
panic("filt_specdetach(): failed to unlink wait queue link.");
}
-
+ knote_clearstayqueued(kn);
(void)wait_queue_link_free(kn->kn_hook);
kn->kn_hook = NULL;
- kn->kn_status &= ~KN_STAYQUEUED;
}
static int
/*
- * Copyright (c) 2004-2014 Apple Inc. All rights reserved.
+ * Copyright (c) 2004-2015 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
BRIDGE_LOCK_ASSERT_HELD(sc);
VERIFY(ifs != NULL);
+ /*
+ * Remove the member from the list first so it cannot be found anymore
+ * when we release the bridge lock below
+ */
+ BRIDGE_XLOCK(sc);
+ TAILQ_REMOVE(&sc->sc_iflist, bif, bif_next);
+ BRIDGE_XDROP(sc);
+
if (!gone) {
switch (ifs->if_type) {
case IFT_ETHER:
/*
* Take the interface out of promiscuous mode.
*/
- if (bif->bif_flags & BIFF_PROMISC)
+ if (bif->bif_flags & BIFF_PROMISC) {
+ /*
+ * Unlock to prevent deadlock with bridge_iff_event() in
+ * case the driver generates an interface event
+ */
+ BRIDGE_UNLOCK(sc);
(void) ifnet_set_promiscuous(ifs, 0);
+ BRIDGE_LOCK(sc);
+ }
break;
case IFT_GIF:
bstp_disable(&bif->bif_stp);
#endif /* BRIDGESTP */
- BRIDGE_XLOCK(sc);
- TAILQ_REMOVE(&sc->sc_iflist, bif, bif_next);
- BRIDGE_XDROP(sc);
-
/*
* If removing the interface that gave the bridge its mac address, set
* the mac address of the bridge to the address of the next member, or
typedef __uint32_t n_time; /* ms since 00:00 GMT, byte rev */
#ifdef BSD_KERNEL_PRIVATE
-n_time iptime(void);
+u_int32_t iptime(void);
#endif /* BSD_KERNEL_PRIVATE */
#endif /* _NETINET_IN_SYSTM_H_ */
/*
- * Copyright (c) 2000-2013 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
*/
struct icmpstat icmpstat;
-SYSCTL_STRUCT(_net_inet_icmp, ICMPCTL_STATS, stats, CTLFLAG_RD | CTLFLAG_LOCKED,
- &icmpstat, icmpstat, "");
+SYSCTL_STRUCT(_net_inet_icmp, ICMPCTL_STATS, stats,
+ CTLFLAG_RD | CTLFLAG_LOCKED,
+ &icmpstat, icmpstat, "");
static int icmpmaskrepl = 0;
-SYSCTL_INT(_net_inet_icmp, ICMPCTL_MASKREPL, maskrepl, CTLFLAG_RW | CTLFLAG_LOCKED,
- &icmpmaskrepl, 0, "");
+SYSCTL_INT(_net_inet_icmp, ICMPCTL_MASKREPL, maskrepl,
+ CTLFLAG_RW | CTLFLAG_LOCKED,
+ &icmpmaskrepl, 0, "");
static int icmptimestamp = 0;
-SYSCTL_INT(_net_inet_icmp, ICMPCTL_TIMESTAMP, timestamp, CTLFLAG_RW | CTLFLAG_LOCKED,
- &icmptimestamp, 0, "");
+SYSCTL_INT(_net_inet_icmp, ICMPCTL_TIMESTAMP, timestamp,
+ CTLFLAG_RW | CTLFLAG_LOCKED,
+ &icmptimestamp, 0, "");
-static int drop_redirect = 0;
-SYSCTL_INT(_net_inet_icmp, OID_AUTO, drop_redirect, CTLFLAG_RW | CTLFLAG_LOCKED,
- &drop_redirect, 0, "");
+static int drop_redirect = 1;
+SYSCTL_INT(_net_inet_icmp, OID_AUTO, drop_redirect,
+ CTLFLAG_RW | CTLFLAG_LOCKED,
+ &drop_redirect, 0, "");
static int log_redirect = 0;
-SYSCTL_INT(_net_inet_icmp, OID_AUTO, log_redirect, CTLFLAG_RW | CTLFLAG_LOCKED,
- &log_redirect, 0, "");
+SYSCTL_INT(_net_inet_icmp, OID_AUTO, log_redirect,
+ CTLFLAG_RW | CTLFLAG_LOCKED,
+ &log_redirect, 0, "");
+
+static int icmp_datalen = 8;
#if ICMP_BANDLIM
struct mbuf *n,
int type,
int code,
- n_long dest,
+ u_int32_t dest,
u_int32_t nextmtu)
{
- struct ip *oip = mtod(n, struct ip *), *nip;
- unsigned oiplen;
+ struct ip *oip, *nip;
struct icmp *icp;
struct mbuf *m;
- unsigned icmplen;
+ u_int32_t oiphlen, icmplen, icmpelen, nlen;
/* Expect 32-bit aligned data pointer on strict-align platforms */
MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(n);
- oiplen = IP_VHL_HL(oip->ip_vhl) << 2;
+ oip = mtod(n, struct ip *);
+ oiphlen = IP_VHL_HL(oip->ip_vhl) << 2;
#if ICMPPRINTFS
if (icmpprintfs)
* Don't error if the old packet protocol was ICMP
* error message, only known informational types.
*/
- if (oip->ip_off &~ (IP_MF|IP_DF))
+ if (oip->ip_off & ~(IP_MF|IP_DF))
goto freeit;
+
if (oip->ip_p == IPPROTO_ICMP && type != ICMP_REDIRECT &&
- n->m_len >= oiplen + ICMP_MINLEN &&
- !ICMP_INFOTYPE(((struct icmp *)(void *)((caddr_t)oip + oiplen))->
+ n->m_len >= oiphlen + ICMP_MINLEN &&
+ !ICMP_INFOTYPE(((struct icmp *)(void *)((caddr_t)oip + oiphlen))->
icmp_type)) {
icmpstat.icps_oldicmp++;
goto freeit;
}
- /* Don't send error in response to a multicast or broadcast packet */
+ /*
+ * Don't send error in response to a multicast or
+ * broadcast packet
+ */
if (n->m_flags & (M_BCAST|M_MCAST))
goto freeit;
+
+ /*
+ * Calculate the length to quote from original packet and prevent
+ * the ICMP mbuf from overflowing.
+ */
+ nlen = m_length(n);
+ if (oip->ip_p == IPPROTO_TCP) {
+ struct tcphdr *th;
+ u_int16_t tcphlen;
+
+ if (oiphlen + sizeof(struct tcphdr) > n->m_len &&
+ n->m_next == NULL)
+ goto stdreply;
+ if (n->m_len < (oiphlen + sizeof(struct tcphdr)) &&
+ (n = m_pullup(n, (oiphlen + sizeof(struct tcphdr)))) == NULL)
+ goto freeit;
+
+ th = (struct tcphdr *)(void *)((caddr_t)oip + oiphlen);
+ if (th != ((struct tcphdr *)P2ROUNDDOWN(th,
+ sizeof(u_int32_t))))
+ goto freeit;
+ tcphlen = th->th_off << 2;
+ if (tcphlen < sizeof(struct tcphdr))
+ goto freeit;
+ if (oip->ip_len < (oiphlen + tcphlen))
+ goto freeit;
+ if ((oiphlen + tcphlen) > n->m_len && n->m_next == NULL)
+ goto stdreply;
+ if (n->m_len < (oiphlen + tcphlen) &&
+ (n = m_pullup(n, (oiphlen + tcphlen))) == NULL)
+ goto freeit;
+
+ icmpelen = max(tcphlen, min(icmp_datalen,
+ (oip->ip_len - oiphlen)));
+ } else
+stdreply: icmpelen = max(ICMP_MINLEN, min(icmp_datalen,
+ (ntohs(oip->ip_len) - oiphlen)));
+
+ icmplen = min(oiphlen + icmpelen, min(nlen, oip->ip_len));
+ if (icmplen < sizeof(struct ip))
+ goto freeit;
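The clamping above bounds how much of the offending datagram gets quoted: icmpelen is at least the full TCP header (or ICMP_MINLEN) and at most icmp_datalen bytes of payload, and icmplen can never run past either the mbuf chain or the original IP length. A minimal stand-alone sketch of the same arithmetic, using hypothetical packet sizes and local min/max helpers:

/* Illustrative sketch only (hypothetical values): the quote-length clamping
 * performed above, for a TCP packet with a 20-byte IP header, a 20-byte TCP
 * header and a 1400-byte payload, with the default icmp_datalen of 8. */
#define EX_MIN(a, b) ((a) < (b) ? (a) : (b))
#define EX_MAX(a, b) ((a) > (b) ? (a) : (b))

static unsigned int
example_quoted_len(void)
{
	unsigned int oiphlen = 20, tcphlen = 20, ip_len = 1440;
	unsigned int chain_len = 1440, icmp_datalen = 8;
	unsigned int icmpelen, icmplen;

	/* quote at least the full TCP header, at most icmp_datalen of data */
	icmpelen = EX_MAX(tcphlen, EX_MIN(icmp_datalen, ip_len - oiphlen));

	/* never quote past the original datagram or the mbuf chain */
	icmplen = EX_MIN(oiphlen + icmpelen, EX_MIN(chain_len, ip_len));
	return icmplen;		/* == 40: IP header plus full TCP header */
}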
/*
* First, formulate icmp message
*/
- m = m_gethdr(M_DONTWAIT, MT_HEADER); /* MAC-OK */
+ if (MHLEN > (sizeof(struct ip) + ICMP_MINLEN + icmplen))
+ m = m_gethdr(M_DONTWAIT, MT_HEADER); /* MAC-OK */
+ else
+ m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
+
if (m == NULL)
goto freeit;
- if (n->m_flags & M_SKIP_FIREWALL) {
- /* set M_SKIP_FIREWALL to skip firewall check, since we're called from firewall */
+ if (n->m_flags & M_SKIP_FIREWALL) {
+ /*
+ * set M_SKIP_FIREWALL to skip firewall check, since
+ * we're called from firewall
+ */
m->m_flags |= M_SKIP_FIREWALL;
}
#if CONFIG_MACF_NET
mac_mbuf_label_associate_netlayer(n, m);
#endif
- icmplen = min(oiplen + 8, oip->ip_len);
- if (icmplen < sizeof(struct ip)) {
- printf("icmp_error: bad length\n");
- m_free(m);
- goto freeit;
- }
- m->m_len = icmplen + ICMP_MINLEN;
+ m->m_len = icmplen + ICMP_MINLEN; /* for ICMP header and data */
MH_ALIGN(m, m->m_len);
icp = mtod(m, struct icmp *);
- if ((u_int)type > ICMP_MAXTYPE)
- panic("icmp_error");
+ if ((u_int)type > ICMP_MAXTYPE) {
+ m_freem(m);
+ goto freeit;
+ }
icmpstat.icps_outhist[type]++;
icp->icmp_type = type;
if (type == ICMP_REDIRECT)
* Now, copy old ip header (without options)
* in front of icmp message.
*/
- if (m->m_data - sizeof(struct ip) < m->m_pktdat)
- panic("icmp len");
+ if (m->m_data - sizeof(struct ip) < m->m_pktdat) {
+ m_freem(m);
+ goto freeit;
+ }
m->m_data -= sizeof(struct ip);
m->m_len += sizeof(struct ip);
m->m_pkthdr.len = m->m_len;
nip->ip_vhl = IP_VHL_BORING;
nip->ip_p = IPPROTO_ICMP;
nip->ip_tos = 0;
+ nip->ip_off = 0;
icmp_reflect(m);
freeit:
ROUTE_RELEASE(&ro);
}
-n_time
+u_int32_t
iptime(void)
{
struct timeval atv;
struct in_ifaddr *ia;
int opt, optlen, cnt, off, code, type = ICMP_PARAMPROB, forward = 0;
struct in_addr *sin, dst;
- n_time ntime;
+ u_int32_t ntime;
struct sockaddr_in ipaddr = {
sizeof (ipaddr), AF_INET, 0, { 0 }, { 0, } };
}
return (0);
bad:
- /* XXX icmp_error adds in hdr length */
- ip->ip_len -= IP_VHL_HL(ip->ip_vhl) << 2;
icmp_error(m, type, code, 0, 0);
OSAddAtomic(1, &ipstat.ips_badoptions);
return (1);
float K, var;
u_int32_t elapsed_time, win;
- VERIFY(tp->t_ccstate->cub_last_max > 0);
win = min(tp->snd_cwnd, tp->snd_wnd);
+ if (tp->t_ccstate->cub_last_max == 0)
+ tp->t_ccstate->cub_last_max = tp->snd_ssthresh;
+
if (tp->t_ccstate->cub_epoch_start == 0) {
/*
* This is the beginning of a new epoch, initialize some of
#ifdef PRIVATE
struct tcp_debug {
- n_time td_time;
+ u_int32_t td_time;
short td_act;
short td_ostate;
caddr_t td_tcb;
/*
- * Copyright (c) 2008-2013 Apple Inc. All rights reserved.
+ * Copyright (c) 2008-2015 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
u_int32_t rip6_recvspace = RIPV6RCVQ;
/* ICMPV6 parameters */
-int icmp6_rediraccept = 1; /* accept and process redirects */
+int icmp6_rediraccept = 0; /* accept and process redirects */
int icmp6_redirtimeout = 10 * 60; /* 10 minutes */
int icmp6errppslim = 500; /* 500 packets per second */
int icmp6rappslim = 10; /* 10 packets per second */
/*
- * Copyright (c) 2003-2014 Apple Inc. All rights reserved.
+ * Copyright (c) 2003-2015 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
}
ip6stat.ip6s_nxthist[ip6->ip6_nxt]++;
-
-#if IPFW2
- /*
- * Check with the firewall...
- */
- if (ip6_fw_enable && ip6_fw_chk_ptr) {
- u_short port = 0;
- /* If ipfw says divert, we have to just drop packet */
- /* use port as a dummy argument */
- if ((*ip6_fw_chk_ptr)(&ip6, NULL, &port, &m)) {
- m_freem(m);
- m = NULL;
- }
- if (!m)
- goto done;
- }
-#endif /* IPFW2 */
-
/*
* Check against address spoofing/corruption.
*/
+ if (!(m->m_pkthdr.pkt_flags & PKTF_LOOP) &&
+ IN6_IS_ADDR_LOOPBACK(&ip6->ip6_src)) {
+ ip6stat.ip6s_badscope++;
+ in6_ifstat_inc(inifp, ifs6_in_addrerr);
+ goto bad;
+ }
if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src) ||
IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_dst)) {
/*
goto bad;
}
#endif
+#if IPFW2
+ /*
+ * Check with the firewall...
+ */
+ if (ip6_fw_enable && ip6_fw_chk_ptr) {
+ u_short port = 0;
+ /* If ipfw says divert, we have to just drop packet */
+ /* use port as a dummy argument */
+ if ((*ip6_fw_chk_ptr)(&ip6, NULL, &port, &m)) {
+ m_freem(m);
+ m = NULL;
+ }
+ if (!m)
+ goto done;
+ }
+#endif /* IPFW2 */
/*
* Naively assume we can attribute inbound data to the route we would
/*
- * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
}
}
- if (req->r_achain.tqe_next == NFSREQNOLIST)
+ if (req->r_achain.tqe_next == NFSREQNOLIST || req->r_achain.tqe_next == NFSIODCOMPLETING)
TAILQ_INSERT_TAIL(&nmp->nm_iodq, req, r_achain);
/* If this mount doesn't already have an nfsiod working on it... */
/*
- * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
nfs4_mount_callback_shutdown(nmp);
/* Destroy any RPCSEC_GSS contexts */
- if (!TAILQ_EMPTY(&nmp->nm_gsscl))
- nfs_gss_clnt_ctx_unmount(nmp);
+ nfs_gss_clnt_ctx_unmount(nmp);
/* mark the socket for termination */
lck_mtx_lock(&nmp->nm_lock);
vnode_internal.h proc_internal.h file_internal.h mount_internal.h \
uio_internal.h tree.h munge.h kern_tests.h
-EXPORT_MI_GEN_LIST = syscall.h sysproto.h
+EXPORT_MI_GEN_LIST = syscall.h sysproto.h kdebugevents.h
EXPORT_MI_DIR = sys
# /System/Library/Frameworks/Kernel.framework/Headers
INSTALL_KF_MI_LIST = ${KERNELFILES}
-INSTALL_KF_MI_GEN_LIST =
+INSTALL_KF_MI_GEN_LIST =
MAKESYSCALLS = $(SRCROOT)/bsd/kern/makesyscalls.sh
+MAKEKDEBUGEVENTS = $(SRCROOT)/bsd/kern/makekdebugevents.py
$(OBJROOT)/cscope.genhdrs:
$(_v)mkdir -p $(OBJROOT)/cscope.genhdrs
@echo "$(OBJPATH)/bsd/sys/$@" > $(OBJROOT)/cscope.genhdrs/$@.path
$(_v)$(MAKESYSCALLS) $< proto > /dev/null
+kdebugevents.h: $(SRCROOT)/bsd/kern/trace.codes $(MAKEKDEBUGEVENTS) $(OBJROOT)/cscope.genhdrs
+ @echo "Generating bsd/kern/$@ from $<";
+ @echo "$(OBJPATH)/bsd/kern/$@" > $(OBJROOT)/cscope.genhdrs/$@.path
+ $(_v)$(MAKEKDEBUGEVENTS) $< > "$(OBJPATH)/bsd/sys/$@"
+
MAKE_POSIX_AVAILABILITY = $(SRCROOT)/bsd/sys/make_posix_availability.sh
_posix_availability.h: $(MAKE_POSIX_AVAILABILITY)
@echo "Generating bsd/sys/$@"
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ *
+ * Portions Copyright (c) 2012 by Delphix. All rights reserved.
*/
#ifndef _SYS_DTRACE_H
#define S_ROUND(x, a) ((x) + (((a) ? (a) : 1) - 1) & ~(((a) ? (a) : 1) - 1))
#define P2ROUNDUP(x, align) (-(-(x) & -(align)))
+#define P2PHASEUP(x, align, phase) ((phase) - (((phase) - (x)) & -(align)))
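P2ROUNDUP rounds x up to the next multiple of a power-of-two alignment; the new P2PHASEUP rounds x up to the next value that is congruent to phase modulo that alignment. A small sketch with hypothetical inputs:

/* Illustrative sketch (hypothetical values) of the two rounding macros. */
#include <assert.h>

static void
p2_examples(void)
{
	assert(P2ROUNDUP(13, 8) == 16);		/* next multiple of 8 */
	assert(P2ROUNDUP(16, 8) == 16);		/* already aligned */
	assert(P2PHASEUP(13, 8, 3) == 19);	/* next n >= 13 with n % 8 == 3 */
	assert(P2PHASEUP(3, 8, 3) == 3);	/* already in phase */
}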
#define CTF_MODEL_ILP32 1 /* object data model is ILP32 */
#define CTF_MODEL_LP64 2 /* object data model is LP64 */
* DTrace Metadata Description Structures
*
* DTrace separates the trace data stream from the metadata stream. The only
- * metadata tokens placed in the data stream are enabled probe identifiers
- * (EPIDs) or (in the case of aggregations) aggregation identifiers. In order
- * to determine the structure of the data, DTrace consumers pass the token to
- * the kernel, and receive in return a corresponding description of the enabled
+ * metadata tokens placed in the data stream are the dtrace_rechdr_t (EPID +
+ * timestamp) or (in the case of aggregations) aggregation identifiers. To
+ * determine the structure of the data, DTrace consumers pass the token to the
+ * kernel, and receive in return a corresponding description of the enabled
* probe (via the dtrace_eprobedesc structure) or the aggregation (via the
* dtrace_aggdesc structure). Both of these structures are expressed in terms
* of record descriptions (via the dtrace_recdesc structure) that describe the
#define DTRACEOPT_AGGHIST 27 /* histogram aggregation output */
#define DTRACEOPT_AGGPACK 28 /* packed aggregation output */
#define DTRACEOPT_AGGZOOM 29 /* zoomed aggregation scaling */
+#define DTRACEOPT_TEMPORAL 30 /* temporally ordered output */
#if !defined(__APPLE__)
-#define DTRACEOPT_MAX 30 /* number of options */
-#else
-#define DTRACEOPT_STACKSYMBOLS 30 /* clear to prevent stack symbolication */
#define DTRACEOPT_MAX 31 /* number of options */
+#else
+#define DTRACEOPT_STACKSYMBOLS 31 /* clear to prevent stack symbolication */
+#define DTRACEOPT_MAX 32 /* number of options */
#endif /* __APPLE__ */
#define DTRACEOPT_UNSET (dtrace_optval_t)-2 /* unset option */
* where user-level wishes the kernel to snapshot the buffer to (the
* dtbd_data field). The kernel uses the same structure to pass back some
* information regarding the buffer: the size of data actually copied out, the
- * number of drops, the number of errors, and the offset of the oldest record.
+ * number of drops, the number of errors, the offset of the oldest record,
+ * and the time of the snapshot.
+ *
* If the buffer policy is a "switch" policy, taking a snapshot of the
* principal buffer has the additional effect of switching the active and
* inactive buffers. Taking a snapshot of the aggregation buffer _always_ has
uint64_t dtbd_drops; /* number of drops */
DTRACE_PTR(char, dtbd_data); /* data */
uint64_t dtbd_oldest; /* offset of oldest record */
+ uint64_t dtbd_timestamp; /* hrtime of snapshot */
} dtrace_bufdesc_t;
+/*
+ * Each record in the buffer (dtbd_data) begins with a header that includes
+ * the epid and a timestamp. The timestamp is split into two 4-byte parts
+ * so that we do not require 8-byte alignment.
+ */
+typedef struct dtrace_rechdr {
+ dtrace_epid_t dtrh_epid; /* enabled probe id */
+ uint32_t dtrh_timestamp_hi; /* high bits of hrtime_t */
+ uint32_t dtrh_timestamp_lo; /* low bits of hrtime_t */
+} dtrace_rechdr_t;
+
+#define DTRACE_RECORD_LOAD_TIMESTAMP(dtrh) \
+ ((dtrh)->dtrh_timestamp_lo + \
+ ((uint64_t)(dtrh)->dtrh_timestamp_hi << 32))
+
+#define DTRACE_RECORD_STORE_TIMESTAMP(dtrh, hrtime) { \
+ (dtrh)->dtrh_timestamp_lo = (uint32_t)hrtime; \
+ (dtrh)->dtrh_timestamp_hi = hrtime >> 32; \
+}
+
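As a consumer-side sketch (the record_size() helper is hypothetical, standing in for the size reported by the matching dtrace_eprobedesc), a snapshotted buffer is now walked one dtrace_rechdr_t at a time, reassembling the 64-bit timestamp from its two halves:

/* Hedged sketch: walk records in a snapshotted buffer (dtbd_data). */
static void
walk_records(const char *base, size_t len,
    size_t (*record_size)(dtrace_epid_t))	/* hypothetical lookup */
{
	size_t offs = 0;

	while (offs < len) {
		const dtrace_rechdr_t *dtrh =
		    (const dtrace_rechdr_t *)(base + offs);
		uint64_t ts;

		if (dtrh->dtrh_epid == DTRACE_EPIDNONE) {
			offs += sizeof (dtrace_epid_t);	/* alignment pad */
			continue;
		}

		ts = DTRACE_RECORD_LOAD_TIMESTAMP(dtrh);
		/* ... merge records from all CPUs in 'ts' order ... */
		(void) ts;
		offs += record_size(dtrh->dtrh_epid);
	}
}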
/*
* DTrace Status
*
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ *
+ * Portions Copyright (c) 2012 by Delphix. All rights reserved.
*/
#ifndef _SYS_DTRACE_IMPL_H
* predicate is non-NULL, the DIF object is executed. If the result is
* non-zero, the action list is processed, with each action being executed
* accordingly. When the action list has been completely executed, processing
- * advances to the next ECB. processing advances to the next ECB. If the
- * result is non-zero; For each ECB, it first determines the The ECB
- * abstraction allows disjoint consumers to multiplex on single probes.
+ * advances to the next ECB. The ECB abstraction allows disjoint consumers
+ * to multiplex on single probes.
+ *
+ * Execution of the ECB results in consuming dte_size bytes in the buffer
+ * to record data. During execution, dte_needed bytes must be available in
+ * the buffer. This space is used for both recorded data and tuple data.
*/
struct dtrace_ecb {
dtrace_epid_t dte_epid; /* enabled probe ID */
uint32_t dte_alignment; /* required alignment */
- size_t dte_needed; /* bytes needed */
- size_t dte_size; /* total size of payload */
+ size_t dte_needed; /* space needed for execution */
+ size_t dte_size; /* size of recorded payload */
dtrace_predicate_t *dte_predicate; /* predicate, if any */
dtrace_action_t *dte_action; /* actions, if any */
dtrace_ecb_t *dte_next; /* next ECB on probe */
* the EPID, the consumer can determine the data layout. (The data buffer
* layout is shown schematically below.) By assuring that one can determine
* data layout from the EPID, the metadata stream can be separated from the
- * data stream -- simplifying the data stream enormously.
- *
- * base of data buffer ---> +------+--------------------+------+
- * | EPID | data | EPID |
- * +------+--------+------+----+------+
- * | data | EPID | data |
- * +---------------+------+-----------+
- * | data, cont. |
- * +------+--------------------+------+
- * | EPID | data | |
- * +------+--------------------+ |
- * | || |
- * | || |
- * | \/ |
- * : :
- * . .
- * . .
- * . .
- * : :
- * | |
- * limit of data buffer ---> +----------------------------------+
+ * data stream -- simplifying the data stream enormously. Each record is
+ * preceded by a dtrace_rechdr_t header that carries the EPID and a
+ * high-resolution timestamp used for output ordering consistency.
+ *
+ * base of data buffer ---> +--------+--------------------+--------+
+ * | rechdr | data | rechdr |
+ * +--------+------+--------+----+--------+
+ * | data | rechdr | data |
+ * +---------------+--------+-------------+
+ * | data, cont. |
+ * +--------+--------------------+--------+
+ * | rechdr | data | |
+ * +--------+--------------------+ |
+ * | || |
+ * | || |
+ * | \/ |
+ * : :
+ * . .
+ * . .
+ * . .
+ * : :
+ * | |
+ * limit of data buffer ---> +--------------------------------------+
*
* When evaluating an ECB, dtrace_probe() determines if the ECB's needs of the
* principal buffer (both scratch and payload) exceed the available space. If
#ifndef _LP64
uint64_t dtb_pad1;
#endif
+ uint64_t dtb_switched; /* time of last switch */
+ uint64_t dtb_interval; /* observed switch interval */
} dtrace_buffer_t;
/*
/*
- * Copyright (c) 2003-2012 Apple Inc. All rights reserved.
+ * Copyright (c) 2003-2015 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
* (which always returns true for regular files - regardless of the amount
* of unread data in the file).
*
- * On input, EV_OOBAND specifies that only OOB data should be looked for.
- * The returned data count is the number of bytes beyond the current OOB marker.
+ * On input, EV_OOBAND specifies that the filter should actively return in the
+ * presence of OOB data on the descriptor. The filter will return if there is
+ * OOB data available to read OR when any other condition for the read is met
+ * (for example, the number of bytes of regular data becomes >= low-watermark).
+ * If EV_OOBAND is not set on input, the filter should not actively return for
+ * out-of-band data on the descriptor. The filter will then only return when
+ * some other condition for read is met (e.g. the number of regular data bytes
+ * is >= low-watermark OR the socket can't receive more data (SS_CANTRCVMORE)).
*
- * On output, EV_OOBAND indicates that OOB data is present
+ * On output, EV_OOBAND indicates the presence of OOB data on the descriptor.
* If it was not specified as an input parameter, then the data count is the
- * number of bytes before the current OOB marker. If at the marker, the
- * data count indicates the number of bytes available after it. In either
- * case, it's the amount of data one could expect to receive next.
+ * number of bytes before the current OOB marker; otherwise the data count is
+ * the number of bytes beyond the OOB marker.
*/
#define EV_POLL EV_FLAG0
#define EV_OOBAND EV_FLAG1
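For illustration, a user-space sketch (kq and sock_fd are hypothetical descriptors) of arming a read filter with the input semantics described above, so it also fires on pending OOB data:

/* Hedged sketch: EVFILT_READ with EV_OOBAND on a hypothetical socket. */
#include <sys/event.h>
#include <sys/time.h>

#ifndef EV_OOBAND			/* private flag; same value as above */
#define EV_OOBAND	EV_FLAG1
#endif

static int
watch_sock(int kq, int sock_fd)
{
	struct kevent kev;

	EV_SET(&kev, sock_fd, EVFILT_READ, EV_ADD | EV_OOBAND, 0, 0, NULL);
	return kevent(kq, &kev, 1, NULL, 0, NULL);
}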
extern int knote_unlink_wait_queue(struct knote *kn, struct wait_queue *wq, wait_queue_link_t *wqlp);
extern void knote_fdclose(struct proc *p, int fd);
extern void knote_markstayqueued(struct knote *kn);
-
+extern void knote_clearstayqueued(struct knote *kn);
#endif /* !KERNEL_PRIVATE */
#else /* KERNEL */
/* Codes for Selective Forced Idle (DBG_MACH_SFI) */
#define SFI_SET_WINDOW 0x0
#define SFI_CANCEL_WINDOW 0x1
-#define SFI_SET_CLASS_OFFTIME 0x2
+#define SFI_SET_CLASS_OFFTIME 0x2
#define SFI_CANCEL_CLASS_OFFTIME 0x3
#define SFI_THREAD_DEFER 0x4
#define SFI_OFF_TIMER 0x5
#define SFI_ON_TIMER 0x6
#define SFI_WAIT_CANCELED 0x7
#define SFI_PID_SET_MANAGED 0x8
-#define SFI_PID_CLEAR_MANAGED 0x9
-
+#define SFI_PID_CLEAR_MANAGED 0x9
+#define SFI_GLOBAL_DEFER 0xa
/* **** The Kernel Debug Sub Classes for Network (DBG_NETWORK) **** */
#define DBG_NETIP 1 /* Internet Protocol */
#define DBG_NETARP 2 /* Address Resolution Protocol */
#define DBG_TRACE_STRING 1
#define DBG_TRACE_INFO 2
-/*
- * TRACE_DATA_NEWTHREAD 0x1
- * TRACE_DATA_EXEC 0x2
- */
-#define TRACE_DATA_THREAD_TERMINATE 0x3 /* thread has been queued for deallocation and can no longer run */
+/* The Kernel Debug events: */
+#define TRACE_DATA_NEWTHREAD (TRACEDBG_CODE(DBG_TRACE_DATA, 1))
+#define TRACE_DATA_EXEC (TRACEDBG_CODE(DBG_TRACE_DATA, 2))
+#define TRACE_DATA_THREAD_TERMINATE (TRACEDBG_CODE(DBG_TRACE_DATA, 3))
+#define TRACE_STRING_NEWTHREAD (TRACEDBG_CODE(DBG_TRACE_STRING, 1))
+#define TRACE_STRING_EXEC (TRACEDBG_CODE(DBG_TRACE_STRING, 2))
+#define TRACE_PANIC (TRACEDBG_CODE(DBG_TRACE_INFO, 0))
+#define TRACE_TIMESTAMPS (TRACEDBG_CODE(DBG_TRACE_INFO, 1))
+#define TRACE_LOST_EVENTS (TRACEDBG_CODE(DBG_TRACE_INFO, 2))
+#define TRACE_WRITING_EVENTS (TRACEDBG_CODE(DBG_TRACE_INFO, 3))
+#define TRACE_INFO_STRING (TRACEDBG_CODE(DBG_TRACE_INFO, 4))
/* The Kernel Debug Sub Classes for DBG_CORESTORAGE */
#define DBG_CS_IO 0
#define KDEBUG_ENABLE_ENTROPY 0x2 /* Obsolescent */
#define KDEBUG_ENABLE_CHUD 0x4
#define KDEBUG_ENABLE_PPT 0x8
+#define KDEBUG_ENABLE_SERIAL 0x10
/*
* Infer the supported kernel debug event level from config option.
/* Minimum value allowed when setting decrementer ticks */
#define KDBG_MINRTCDEC 2500
+/* VFS lookup events for serial traces */
+#define VFS_LOOKUP (FSDBG_CODE(DBG_FSRW,36))
+#define VFS_LOOKUP_DONE (FSDBG_CODE(DBG_FSRW,39))
+
+#if (DEVELOPMENT || DEBUG)
+#define KDEBUG_MOJO_TRACE 1
+#endif
+
#endif /* __APPLE_API_PRIVATE */
#endif /* PRIVATE */
/*
- * Copyright (c) 2004-2008 Apple Inc. All rights reserved.
+ * Copyright (c) 2004-2014 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
VATTR_WANTED(&va, va_mode);
VATTR_WANTED(&va, va_uid);
VATTR_WANTED(&va, va_gid);
+ VATTR_WANTED(&va, va_nlink);
if ((ret = vnode_getattr(vp, &va, vfs_context_kernel())) != 0) {
// printf("add_fsevent: failed to getattr on vp %p (%d)\n", cur->fref.vp, ret);
cur->str = NULL;
cur->mode = (int32_t)vnode_vttoif(vnode_vtype(vp)) | va.va_mode;
cur->uid = va.va_uid;
cur->gid = va.va_gid;
+ if (vp->v_flag & VISHARDLINK) {
+ cur->mode |= FSE_MODE_HLINK;
+ if ((vp->v_type == VDIR && va.va_dirlinkcount == 0) || (vp->v_type == VREG && va.va_nlink == 0)) {
+ cur->mode |= FSE_MODE_LAST_HLINK;
+ }
+ }
// if we haven't gotten the path yet, get it.
if (pathbuff == NULL) {
* entries, we must mark the start of the path's string and the end.
*/
if (lookup == TRUE)
- code = (FSDBG_CODE(DBG_FSRW,36)) | DBG_FUNC_START;
+ code = VFS_LOOKUP | DBG_FUNC_START;
else
- code = (FSDBG_CODE(DBG_FSRW,39)) | DBG_FUNC_START;
+ code = VFS_LOOKUP_DONE | DBG_FUNC_START;
if (dbg_namelen <= (int)(3 * sizeof(long)))
code |= DBG_FUNC_END;
is64bit = proc_is64bit(p);
memp = NULL;
+
+ /*
+ * ensure the buffer is large enough for underlying calls
+ */
+#ifndef HFSIOC_GETPATH
+typedef char pn_t[MAXPATHLEN];
+#define HFSIOC_GETPATH _IOWR('h', 13, pn_t)
+#endif
+
+#ifndef HFS_GETPATH
+#define HFS_GETPATH IOCBASECMD(HFSIOC_GETPATH)
+#endif
+ if (IOCBASECMD(cmd) == HFS_GETPATH) {
+ /* Round up to MAXPATHLEN regardless of user input */
+ size = MAXPATHLEN;
+ }
+
+
if (size > sizeof (stkbuf)) {
if ((memp = (caddr_t)kalloc(size)) == 0) return ENOMEM;
data = memp;
-14.1.0
+14.3.0
# The first line of this file contains the master version number for the kernel.
# All other instances of the kernel version in xnu are derived from this file.
_buf_kernel_addrperm_addr
_buf_setfilter
_buf_shadow
+_bufattr_alloc
+_bufattr_dup
+_bufattr_free
+_bufattr_greedymode
+_bufattr_isochronous
+_bufattr_markgreedymode
+_bufattr_markisochronous
+_bufattr_markmeta
+_bufattr_markquickcomplete
_bufattr_meta
_bufattr_nocache
-_bufattr_throttled
_bufattr_passive
+_bufattr_quickcomplete
+_bufattr_throttled
_cdevsw
_cdevsw_setkqueueok
_chudxnu_platform_ptr
--- /dev/null
+*.pbxuser
+*.perspectivev3
+build/
if (kr != KERN_SUCCESS) {
break;
}
+ } else {
+ kr = KERN_INVALID_TASK;
+ break;
}
/* Increment sync value. */
*/
next->mailbox = mailbox;
lck_mtx_unlock(&atm_value->listener_lock);
- KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (ATM_CODE(ATM_GETVALUE_INFO, (ATM_VALUE_REPLACED))) | DBG_FUNC_NONE,
- atm_value, atm_value->aid, mailbox_offset, 0, 0);
+ KERNEL_DEBUG_CONSTANT((ATM_CODE(ATM_GETVALUE_INFO, (ATM_VALUE_REPLACED))) | DBG_FUNC_NONE,
+ VM_KERNEL_ADDRPERM(atm_value), atm_value->aid, mailbox_offset, 0, 0);
/* Drop the extra reference on task descriptor taken by this function. */
atm_task_descriptor_dealloc(task_descriptor);
return KERN_SUCCESS;
}
}
- KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (ATM_CODE(ATM_GETVALUE_INFO, (ATM_VALUE_ADDED))) | DBG_FUNC_NONE,
- atm_value, atm_value->aid, mailbox_offset, 0, 0);
+ KERNEL_DEBUG_CONSTANT((ATM_CODE(ATM_GETVALUE_INFO, (ATM_VALUE_ADDED))) | DBG_FUNC_NONE,
+ VM_KERNEL_ADDRPERM(atm_value), atm_value->aid, mailbox_offset, 0, 0);
queue_enter(&atm_value->listeners, new_link_object, atm_link_object_t, listeners_element);
atm_value->listener_count++;
if (elem->descriptor == task_descriptor) {
if (elem->mailbox == mailbox) {
- KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (ATM_CODE(ATM_UNREGISTER_INFO,
+ KERNEL_DEBUG_CONSTANT((ATM_CODE(ATM_UNREGISTER_INFO,
(ATM_VALUE_UNREGISTERED))) | DBG_FUNC_NONE,
- atm_value, atm_value->aid, mailbox_offset, 0, 0);
+ VM_KERNEL_ADDRPERM(atm_value), atm_value->aid, mailbox_offset, 0, 0);
queue_remove(&atm_value->listeners, elem, atm_link_object_t, listeners_element);
queue_enter(&free_listeners, elem, atm_link_object_t, listeners_element);
atm_value->listener_count--;
kr = KERN_SUCCESS;
break;
} else {
- KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (ATM_CODE(ATM_UNREGISTER_INFO,
+ KERNEL_DEBUG_CONSTANT((ATM_CODE(ATM_UNREGISTER_INFO,
(ATM_VALUE_DIFF_MAILBOX))) | DBG_FUNC_NONE,
- atm_value, atm_value->aid, 0, 0, 0);
+ VM_KERNEL_ADDRPERM(atm_value), atm_value->aid, 0, 0, 0);
kr = KERN_INVALID_VALUE;
break;
}
atm_value_t atm_value;
uint32_t i;
- KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (ATM_CODE(ATM_SUBAID_INFO, (ATM_MIN_CALLED))) | DBG_FUNC_START,
+ KERNEL_DEBUG_CONSTANT((ATM_CODE(ATM_SUBAID_INFO, (ATM_MIN_CALLED))) | DBG_FUNC_START,
0, 0, 0, 0, 0);
for (i = 0; i < count; i++) {
atm_value_dealloc(atm_value);
}
- KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (ATM_CODE(ATM_SUBAID_INFO, (ATM_MIN_CALLED))) | DBG_FUNC_END,
+ KERNEL_DEBUG_CONSTANT((ATM_CODE(ATM_SUBAID_INFO, (ATM_MIN_CALLED))) | DBG_FUNC_END,
count, 0, 0, 0, 0);
}
atm_link_object_t next, elem;
queue_head_t free_listeners;
- KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (ATM_CODE(ATM_SUBAID_INFO, (ATM_MIN_LINK_LIST))) | DBG_FUNC_START,
+ KERNEL_DEBUG_CONSTANT((ATM_CODE(ATM_SUBAID_INFO, (ATM_MIN_LINK_LIST))) | DBG_FUNC_START,
0, 0, 0, 0, 0);
lck_mtx_lock(&atm_value->listener_lock);
atm_link_dealloc(next);
}
- KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (ATM_CODE(ATM_SUBAID_INFO, (ATM_MIN_LINK_LIST))) | DBG_FUNC_END,
+ KERNEL_DEBUG_CONSTANT((ATM_CODE(ATM_SUBAID_INFO, (ATM_MIN_LINK_LIST))) | DBG_FUNC_END,
j, freed_count, dead_but_not_freed, 0, 0);
/* explicitly upgrade uint32_t to 64 bit mach size */
#include <device/device_port.h>
ipc_port_t master_device_port;
+void *master_device_kobject;
lck_grp_attr_t * dev_lck_grp_attr;
lck_grp_t * dev_lck_grp;
if (master_device_port == IP_NULL)
panic("can't allocate master device port");
- ipc_kobject_set(master_device_port, 1, IKOT_MASTER_DEVICE);
- kernel_set_special_port(host_priv_self(), HOST_IO_MASTER_PORT,
+ ipc_kobject_set(master_device_port, (ipc_kobject_t)&master_device_kobject, IKOT_MASTER_DEVICE);
+ kernel_set_special_port(host_priv_self(), HOST_IO_MASTER_PORT,
ipc_port_make_send(master_device_port));
/* allocate device lock group attribute and group */
#include <mach/vm_prot.h>
#include <mach/machine.h>
#include <mach/time_value.h>
+#include <sys/kdebug.h>
#include <kern/spl.h>
#include <kern/assert.h>
#include <kern/debug.h>
#include <architecture/i386/pio.h> /* inb() */
#include <pexpert/i386/boot.h>
+#include <kdp/kdp_dyld.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_kern.h>
#endif
static void machine_conf(void);
+void panic_print_symbol_name(vm_address_t search);
+extern boolean_t init_task_died;
+extern const char version[];
+extern char osversion[];
extern int max_unsafe_quanta;
extern int max_poll_quanta;
extern unsigned int panic_is_inited;
+extern int proc_pid(void *p);
+
+/* Definitions for frame pointers */
+#define FP_ALIGNMENT_MASK ((uint32_t)(0x3))
+#define FP_LR_OFFSET ((uint32_t)4)
+#define FP_LR_OFFSET64 ((uint32_t)8)
+#define FP_MAX_NUM_TO_EVALUATE (50)
+
int db_run_mode;
volatile int pbtcpu = -1;
unsigned int debug_boot_arg;
+/*
+ * Backtrace a single frame.
+ */
+void
+print_one_backtrace(pmap_t pmap, vm_offset_t topfp, const char *cur_marker,
+ boolean_t is_64_bit, boolean_t nvram_format)
+{
+ int i = 0;
+ addr64_t lr;
+ addr64_t fp;
+ addr64_t fp_for_ppn;
+ ppnum_t ppn;
+ boolean_t dump_kernel_stack;
+
+ fp = topfp;
+ fp_for_ppn = 0;
+ ppn = (ppnum_t)NULL;
+
+ if (fp >= VM_MIN_KERNEL_ADDRESS)
+ dump_kernel_stack = TRUE;
+ else
+ dump_kernel_stack = FALSE;
+
+ do {
+ if ((fp == 0) || ((fp & FP_ALIGNMENT_MASK) != 0))
+ break;
+ if (dump_kernel_stack && ((fp < VM_MIN_KERNEL_ADDRESS) || (fp > VM_MAX_KERNEL_ADDRESS)))
+ break;
+ if ((!dump_kernel_stack) && (fp >= VM_MIN_KERNEL_ADDRESS))
+ break;
+
+ /* Check to see if current address will result in a different
+ ppn than previously computed (to avoid recomputation) via
+ (((addr) ^ fp_for_ppn) >> PAGE_SHIFT) */
+
+ if ((((fp + FP_LR_OFFSET) ^ fp_for_ppn) >> PAGE_SHIFT) != 0x0U) {
+ ppn = pmap_find_phys(pmap, fp + FP_LR_OFFSET);
+ fp_for_ppn = fp + (is_64_bit ? FP_LR_OFFSET64 : FP_LR_OFFSET);
+ }
+ if (ppn != (ppnum_t)NULL) {
+ if (is_64_bit) {
+ lr = ml_phys_read_double_64(((((vm_offset_t)ppn) << PAGE_SHIFT)) | ((fp + FP_LR_OFFSET64) & PAGE_MASK));
+ } else {
+ lr = ml_phys_read_word(((((vm_offset_t)ppn) << PAGE_SHIFT)) | ((fp + FP_LR_OFFSET) & PAGE_MASK));
+ }
+ } else {
+ if (is_64_bit) {
+ kdb_printf("%s\t Could not read LR from frame at 0x%016llx\n", cur_marker, fp + FP_LR_OFFSET64);
+ } else {
+ kdb_printf("%s\t Could not read LR from frame at 0x%08x\n", cur_marker, (uint32_t)(fp + FP_LR_OFFSET));
+ }
+ break;
+ }
+ if (((fp ^ fp_for_ppn) >> PAGE_SHIFT) != 0x0U) {
+ ppn = pmap_find_phys(pmap, fp);
+ fp_for_ppn = fp;
+ }
+ if (ppn != (ppnum_t)NULL) {
+ if (is_64_bit) {
+ fp = ml_phys_read_double_64(((((vm_offset_t)ppn) << PAGE_SHIFT)) | (fp & PAGE_MASK));
+ } else {
+ fp = ml_phys_read_word(((((vm_offset_t)ppn) << PAGE_SHIFT)) | (fp & PAGE_MASK));
+ }
+ } else {
+ if (is_64_bit) {
+ kdb_printf("%s\t Could not read FP from frame at 0x%016llx\n", cur_marker, fp);
+ } else {
+ kdb_printf("%s\t Could not read FP from frame at 0x%08x\n", cur_marker, (uint32_t)fp);
+ }
+ break;
+ }
+
+ if (nvram_format) {
+ if (is_64_bit) {
+ kdb_printf("%s\t0x%016llx\n", cur_marker, lr);
+ } else {
+ kdb_printf("%s\t0x%08x\n", cur_marker, (uint32_t)lr);
+ }
+ } else {
+ if (is_64_bit) {
+ kdb_printf("%s\t lr: 0x%016llx fp: 0x%016llx\n", cur_marker, lr, fp);
+ } else {
+ kdb_printf("%s\t lr: 0x%08x fp: 0x%08x\n", cur_marker, (uint32_t)lr, (uint32_t)fp);
+ }
+ }
+ } while ((++i < FP_MAX_NUM_TO_EVALUATE) && (fp != topfp));
+}
void
machine_startup(void)
{
if (debug_boot_arg & DB_PRT) disable_debug_output=FALSE;
if (debug_boot_arg & DB_SLOG) systemLogDiags=TRUE;
if (debug_boot_arg & DB_LOG_PI_SCRN) logPanicDataToScreen=TRUE;
+#if KDEBUG_MOJO_TRACE
+ if (debug_boot_arg & DB_PRT_KDEBUG) {
+ kdebug_serial = TRUE;
+ disable_debug_output = FALSE;
+ }
+#endif
} else {
debug_boot_arg = 0;
}
pmCPUHalt(PM_HALT_DEBUG);
}
+static int pid_from_task(task_t task)
+{
+ int pid = -1;
+
+ if (task->bsd_info)
+ pid = proc_pid(task->bsd_info);
+
+ return pid;
+}
+
void
DebuggerWithContext(
__unused unsigned int reason,
unsigned long pi_size = 0;
void *stackptr;
int cn = cpu_number();
+ task_t task = current_task();
+ int task_pid = pid_from_task(task);
+
hw_atomic_add(&debug_mode, 1);
if (!panic_is_inited) {
__asm__ volatile("movq %%rbp, %0" : "=m" (stackptr));
/* Print backtrace - callee is internally synchronized */
- panic_i386_backtrace(stackptr, ((panic_double_fault_cpu == cn) ? 80: 48), NULL, FALSE, NULL);
+ if ((task_pid == 1) && (init_task_died)) {
+ /* Special handling of launchd died panics */
+ print_launchd_info();
+ } else {
+ panic_i386_backtrace(stackptr, ((panic_double_fault_cpu == cn) ? 80: 48), NULL, FALSE, NULL);
+ }
/* everything should be printed now so copy to NVRAM
*/
}
}
-static void
+void
panic_print_symbol_name(vm_address_t search)
{
/* try searching in the kernel */
bt_tsc_timeout = rdtsc64() + PBT_TIMEOUT_CYCLES;
while(*ppbtcnt && (rdtsc64() < bt_tsc_timeout));
}
+
+static boolean_t
+debug_copyin(pmap_t p, uint64_t uaddr, void *dest, size_t size)
+{
+ size_t rem = size;
+ char *kvaddr = dest;
+
+ while (rem) {
+ ppnum_t upn = pmap_find_phys(p, uaddr);
+ uint64_t phys_src = ptoa_64(upn) | (uaddr & PAGE_MASK);
+ uint64_t phys_dest = kvtophys((vm_offset_t)kvaddr);
+ uint64_t src_rem = PAGE_SIZE - (phys_src & PAGE_MASK);
+ uint64_t dst_rem = PAGE_SIZE - (phys_dest & PAGE_MASK);
+ size_t cur_size = (uint32_t) MIN(src_rem, dst_rem);
+ cur_size = MIN(cur_size, rem);
+
+ if (upn && pmap_valid_page(upn) && phys_dest) {
+ bcopy_phys(phys_src, phys_dest, cur_size);
+ }
+ else
+ break;
+ uaddr += cur_size;
+ kvaddr += cur_size;
+ rem -= cur_size;
+ }
+ return (rem == 0);
+}
+
+void
+print_threads_registers(thread_t thread)
+{
+ x86_saved_state_t *savestate;
+
+ savestate = get_user_regs(thread);
+ kdb_printf(
+ "\nRAX: 0x%016llx, RBX: 0x%016llx, RCX: 0x%016llx, RDX: 0x%016llx\n"
+ "RSP: 0x%016llx, RBP: 0x%016llx, RSI: 0x%016llx, RDI: 0x%016llx\n"
+ "R8: 0x%016llx, R9: 0x%016llx, R10: 0x%016llx, R11: 0x%016llx\n"
+ "R12: 0x%016llx, R13: 0x%016llx, R14: 0x%016llx, R15: 0x%016llx\n"
+ "RFL: 0x%016llx, RIP: 0x%016llx, CS: 0x%016llx, SS: 0x%016llx\n\n",
+ savestate->ss_64.rax, savestate->ss_64.rbx, savestate->ss_64.rcx, savestate->ss_64.rdx,
+ savestate->ss_64.isf.rsp, savestate->ss_64.rbp, savestate->ss_64.rsi, savestate->ss_64.rdi,
+ savestate->ss_64.r8, savestate->ss_64.r9, savestate->ss_64.r10, savestate->ss_64.r11,
+ savestate->ss_64.r12, savestate->ss_64.r13, savestate->ss_64.r14, savestate->ss_64.r15,
+ savestate->ss_64.isf.rflags, savestate->ss_64.isf.rip, savestate->ss_64.isf.cs,
+ savestate->ss_64.isf.ss);
+}
+
+void
+print_tasks_user_threads(task_t task)
+{
+ thread_t thread = current_thread();
+ x86_saved_state_t *savestate;
+ pmap_t pmap = 0;
+ uint64_t rbp;
+ const char *cur_marker = 0;
+ int j;
+
+ for (j = 0, thread = (thread_t) queue_first(&task->threads); j < task->thread_count;
+ ++j, thread = (thread_t) queue_next(&thread->task_threads)) {
+
+ kdb_printf("Thread %p\n", thread);
+ pmap = get_task_pmap(task);
+ savestate = get_user_regs(thread);
+ rbp = savestate->ss_64.rbp;
+ print_one_backtrace(pmap, (vm_offset_t)rbp, cur_marker, TRUE, TRUE);
+ kdb_printf("\n");
+ }
+}
+
+#define PANICLOG_UUID_BUF_SIZE 256
+
+void print_uuid_info(task_t task)
+{
+ uint32_t uuid_info_count = 0;
+ mach_vm_address_t uuid_info_addr = 0;
+ boolean_t have_map = (task->map != NULL) && (ml_validate_nofault((vm_offset_t)(task->map), sizeof(struct _vm_map)));
+ boolean_t have_pmap = have_map && (task->map->pmap != NULL) && (ml_validate_nofault((vm_offset_t)(task->map->pmap), sizeof(struct pmap)));
+ int task_pid = pid_from_task(task);
+ char uuidbuf[PANICLOG_UUID_BUF_SIZE] = {0};
+ char *uuidbufptr = uuidbuf;
+ uint32_t k;
+
+ if (have_pmap && task->active && task_pid > 0) {
+ /* Read dyld_all_image_infos struct from task memory to get UUID array count & location */
+ struct user64_dyld_all_image_infos task_image_infos;
+ if (debug_copyin(task->map->pmap, task->all_image_info_addr,
+ &task_image_infos, sizeof(struct user64_dyld_all_image_infos))) {
+ uuid_info_count = (uint32_t)task_image_infos.uuidArrayCount;
+ uuid_info_addr = task_image_infos.uuidArray;
+ }
+
+ /* If we get a NULL uuid_info_addr (which can happen when we catch dyld
+ * in the middle of updating this data structure), we zero the
+ * uuid_info_count so that we won't even try to save load info for this task
+ */
+ if (!uuid_info_addr) {
+ uuid_info_count = 0;
+ }
+ }
+
+ if (task_pid > 0 && uuid_info_count > 0) {
+ uint32_t uuid_info_size = sizeof(struct user64_dyld_uuid_info);
+ uint32_t uuid_array_size = uuid_info_count * uuid_info_size;
+ uint32_t uuid_copy_size = 0;
+ uint32_t uuid_image_count = 0;
+ char *current_uuid_buffer = NULL;
+ /* Copy in the UUID info array. It may be nonresident, in which case just fix up nloadinfos to 0 */
+
+ kdb_printf("\nuuid info:\n");
+ while (uuid_array_size) {
+ if (uuid_array_size <= PANICLOG_UUID_BUF_SIZE) {
+ uuid_copy_size = uuid_array_size;
+ uuid_image_count = uuid_array_size/uuid_info_size;
+ } else {
+ uuid_image_count = PANICLOG_UUID_BUF_SIZE/uuid_info_size;
+ uuid_copy_size = uuid_image_count * uuid_info_size;
+ }
+ if (have_pmap && !debug_copyin(task->map->pmap, uuid_info_addr, uuidbufptr,
+ uuid_copy_size)) {
+ kdb_printf("Error!! Failed to copy UUID info for task %p pid %d\n", task, task_pid);
+ uuid_image_count = 0;
+ break;
+ }
+
+ if (uuid_image_count > 0) {
+ current_uuid_buffer = uuidbufptr;
+ for (k = 0; k < uuid_image_count; k++) {
+ kdb_printf(" %#llx", *(uint64_t *)current_uuid_buffer);
+ current_uuid_buffer += sizeof(uint64_t);
+ uint8_t *uuid = (uint8_t *)current_uuid_buffer;
+ kdb_printf("\tuuid = <%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x>\n",
+ uuid[0], uuid[1], uuid[2], uuid[3], uuid[4], uuid[5], uuid[6], uuid[7], uuid[8],
+ uuid[9], uuid[10], uuid[11], uuid[12], uuid[13], uuid[14], uuid[15]);
+ current_uuid_buffer += 16;
+ }
+ bzero(&uuidbuf, sizeof(uuidbuf));
+ }
+ uuid_info_addr += uuid_copy_size;
+ uuid_array_size -= uuid_copy_size;
+ }
+ }
+}
+
+void print_launchd_info(void)
+{
+ task_t task = current_task();
+ thread_t thread = current_thread();
+ volatile uint32_t *ppbtcnt = &pbtcnt;
+ uint64_t bt_tsc_timeout;
+ int cn = cpu_number();
+
+ if(pbtcpu != cn) {
+ hw_atomic_add(&pbtcnt, 1);
+ /* Spin on print backtrace lock, which serializes output.
+ * Continue anyway if a timeout occurs.
+ */
+ hw_lock_to(&pbtlock, ~0U);
+ pbtcpu = cn;
+ }
+
+ print_uuid_info(task);
+ print_threads_registers(thread);
+ print_tasks_user_threads(task);
+ kdb_printf("Mac OS version: %s\n", (osversion[0] != 0) ? osversion : "Not yet set");
+ kdb_printf("Kernel version: %s\n", version);
+ panic_display_kernel_uuid();
+ panic_display_model_name();
+
+ /* Release print backtrace lock, to permit other callers in the
+ * event of panics on multiple processors.
+ */
+ hw_lock_unlock(&pbtlock);
+ hw_atomic_sub(&pbtcnt, 1);
+ /* Wait for other processors to complete output
+ * Timeout and continue after PBT_TIMEOUT_CYCLES.
+ */
+ bt_tsc_timeout = rdtsc64() + PBT_TIMEOUT_CYCLES;
+ while(*ppbtcnt && (rdtsc64() < bt_tsc_timeout));
+
+}
#if CONFIG_MTRR
#include <i386/mtrr.h>
#endif
+#if HYPERVISOR
+#include <kern/hv_support.h>
+#endif
#if CONFIG_VMX
#include <i386/vmx/vmx_cpu.h>
#endif
/* Save power management timer state */
pmTimerSave();
+#if HYPERVISOR
+ /* Notify hypervisor that we are about to sleep */
+ hv_suspend();
+#endif
+
#if CONFIG_VMX
/*
* Turn off VT, otherwise switching to legacy mode will fail
case CPUID_MODEL_CRYSTALWELL:
cpufamily = CPUFAMILY_INTEL_HASWELL;
break;
-#if !defined(XNU_HIDE_SEED)
case CPUID_MODEL_BROADWELL:
case CPUID_MODEL_BRYSTALWELL:
cpufamily = CPUFAMILY_INTEL_BROADWELL;
break;
-#endif /* not XNU_HIDE_SEED */
}
break;
}
{CPUID_LEAF7_FEATURE_RTM, "RTM"},
{CPUID_LEAF7_FEATURE_RDSEED, "RDSEED"},
{CPUID_LEAF7_FEATURE_ADX, "ADX"},
-#if !defined(XNU_HIDE_SEED)
{CPUID_LEAF7_FEATURE_SMAP, "SMAP"},
-#endif /* not XNU_HIDE_SEED */
{0, 0}
};
#define CPUID_LEAF7_FEATURE_RTM _Bit(11) /* RTM */
#define CPUID_LEAF7_FEATURE_RDSEED _Bit(18) /* RDSEED Instruction */
#define CPUID_LEAF7_FEATURE_ADX _Bit(19) /* ADX Instructions */
-#if !defined(XNU_HIDE_SEED)
#define CPUID_LEAF7_FEATURE_SMAP _Bit(20) /* Supervisor Mode Access Protect */
-#endif /* not XNU_HIDE_SEED */
/*
* The CPUID_EXTFEATURE_XXX values define 64-bit values
#define CPUID_MODEL_HASWELL 0x3C
#define CPUID_MODEL_HASWELL_EP 0x3F
#define CPUID_MODEL_HASWELL_ULT 0x45
-#if !defined(XNU_HIDE_SEED)
#define CPUID_MODEL_BROADWELL 0x3D
#define CPUID_MODEL_BROADWELL_ULX 0x3D
#define CPUID_MODEL_BROADWELL_ULT 0x3D
#define CPUID_MODEL_BRYSTALWELL 0x47
-#endif /* not XNU_HIDE_SEED */
#define CPUID_VMM_FAMILY_UNKNOWN 0x0
#define CPUID_VMM_FAMILY_VMWARE 0x1
}
}
-bool panic_phys_range_before(const void *addr, uint64_t *pphys,
+boolean_t panic_phys_range_before(const void *addr, uint64_t *pphys,
panic_phys_range_t *range)
{
*pphys = kvtophys((vm_offset_t)addr);
#if XNU_KERNEL_PRIVATE
#include <stdint.h>
-#include <stdbool.h>
+#include <mach/i386/boolean.h>
typedef struct {
uint64_t opaque[6];
uint64_t len;
} panic_phys_range_t;
-bool panic_phys_range_before(const void *addr, uint64_t *pphys,
+boolean_t panic_phys_range_before(const void *addr, uint64_t *pphys,
panic_phys_range_t *range);
#endif // XNU_KERNEL_PRIVATE
__asm__ volatile("invlpg (%0)" :: "r" (addr) : "memory");
}
+static inline void clac(void)
+{
+ __asm__ volatile("clac");
+}
+
+static inline void stac(void)
+{
+ __asm__ volatile("stac");
+}
+
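The two intrinsics wrap the STAC/CLAC instructions that toggle EFLAGS.AC; with SMAP enabled, supervisor code may only touch user pages while AC is set. A hedged sketch of the bracketing pattern (the access itself is elided; per the trap-handler change below, the copyin/copyout path is exactly the case where AC is expected to be set):

/* Hedged sketch: bracket an intentional user-memory access with stac()/clac()
 * so it is not reported as an SMAP violation. The access body is elided. */
static void
user_access_window(void)
{
	stac();		/* set EFLAGS.AC: permit supervisor access to user pages */
	/* ... touch user-mapped memory here ... */
	clac();		/* clear EFLAGS.AC: re-arm SMAP */
}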
/*
* Access to machine-specific registers (available on 586 and better only)
* Note: the rd* operations modify the parameters directly (without using
goto debugger_entry;
}
+ /*
+ * Additionally check for SMAP faults...
+ * which are characterized by page-present and
+ * the AC bit unset (i.e. not from copyin/out path).
+ */
+ if (__improbable(code & T_PF_PROT &&
+ pmap_smap_enabled &&
+ (saved_state->isf.rflags & EFL_AC) == 0)) {
+ goto debugger_entry;
+ }
+
/*
* If we're not sharing cr3 with the user
* and we faulted in copyio,
const char *trapname = "Unknown";
pal_cr_t cr0, cr2, cr3, cr4;
boolean_t potential_smep_fault = FALSE, potential_kernel_NX_fault = FALSE;
+ boolean_t potential_smap_fault = FALSE;
pal_get_control_registers( &cr0, &cr2, &cr3, &cr4 );
assert(ml_get_interrupts_enabled() == FALSE);
} else if (regs->isf.rip >= VM_MIN_KERNEL_AND_KEXT_ADDRESS) {
potential_kernel_NX_fault = TRUE;
}
+ } else if (pmap_smap_enabled &&
+ regs->isf.trapno == T_PAGE_FAULT &&
+ regs->isf.err & T_PF_PROT &&
+ regs->cr2 < VM_MAX_USER_PAGE_ADDRESS &&
+ regs->isf.rip >= VM_MIN_KERNEL_AND_KEXT_ADDRESS) {
+ potential_smap_fault = TRUE;
}
#undef panic
virtualized ? " VMM" : "",
potential_kernel_NX_fault ? " Kernel NX fault" : "",
potential_smep_fault ? " SMEP/User NX fault" : "",
- "");
+ potential_smap_fault ? " SMAP fault" : "");
/*
* This next statement is not executed,
* but it's needed to stop the compiler using tail call optimization
extern volatile perfCallback perfIntHook;
extern void panic_i386_backtrace(void *, int, const char *, boolean_t, x86_saved_state_t *);
+extern void print_one_backtrace(pmap_t pmap, vm_offset_t topfp, const char *cur_marker, boolean_t is_64_bit, boolean_t nvram_format);
+extern void print_tasks_user_threads(task_t task);
+extern void print_threads_registers(thread_t thread);
+extern void print_uuid_info(task_t task);
+extern void print_launchd_info(void);
+
#if MACH_KDP
extern boolean_t kdp_i386_trap(
unsigned int,
kaddr = (mach_vm_address_t)port->ip_kobject;
ip_unlock(port);
+
if (0 != kaddr && is_ipc_kobject(*typep))
*addrp = VM_KERNEL_UNSLIDE_OR_PERM(kaddr);
else
/* BSD KERN COMPONENT INTERFACE */
task_t bsd_init_task = TASK_NULL;
+boolean_t init_task_died;
char init_task_failure_data[1024];
extern unsigned int not_in_kdp; /* Skip acquiring locks if we're in kdp */
unsigned int systemLogDiags = FALSE;
unsigned int panicDebugging = FALSE;
unsigned int logPanicDataToScreen = FALSE;
+unsigned int kdebug_serial = FALSE;
int mach_assert = 1;
strlcpy(model_name, model_string, sizeof(model_name));
}
-static void panic_display_model_name(void) {
+void panic_display_model_name(void) {
char tmp_model_name[sizeof(model_name)];
if (ml_nofault_copy((vm_offset_t) &model_name, (vm_offset_t) &tmp_model_name, sizeof(model_name)) != sizeof(model_name))
kdb_printf("System model name: %s\n", tmp_model_name);
}
-static void panic_display_kernel_uuid(void) {
+void panic_display_kernel_uuid(void) {
char tmp_kernel_uuid[sizeof(kernel_uuid_string)];
if (ml_nofault_copy((vm_offset_t) &kernel_uuid_string, (vm_offset_t) &tmp_kernel_uuid, sizeof(kernel_uuid_string)) != sizeof(kernel_uuid_string))
#if CONFIG_ZLEAKS
extern boolean_t panic_include_ztrace;
extern struct ztrace* top_ztrace;
+void panic_print_symbol_name(vm_address_t search);
+
/*
* Prints the backtrace most suspected of being a leaker, if we paniced in the zone allocator.
* top_ztrace and panic_include_ztrace comes from osfmk/kern/zalloc.c
{
if(panic_include_ztrace == TRUE) {
unsigned int i = 0;
+ boolean_t keepsyms = FALSE;
+
+ PE_parse_boot_argn("keepsyms", &keepsyms, sizeof (keepsyms));
struct ztrace top_ztrace_copy;
/* Make sure not to trip another panic if there's something wrong with memory */
kdb_printf("\nBacktrace suspected of leaking: (outstanding bytes: %lu)\n", (uintptr_t)top_ztrace_copy.zt_size);
/* Print the backtrace addresses */
for (i = 0; (i < top_ztrace_copy.zt_depth && i < MAX_ZTRACE_DEPTH) ; i++) {
- kdb_printf("%p\n", top_ztrace_copy.zt_stack[i]);
+ kdb_printf("%p ", top_ztrace_copy.zt_stack[i]);
+ if (keepsyms) {
+ panic_print_symbol_name((vm_address_t)top_ztrace_copy.zt_stack[i]);
+ }
+ kdb_printf("\n");
}
/* Print any kexts in that backtrace, along with their link addresses so we can properly blame them */
kmod_panic_dump((vm_offset_t *)&top_ztrace_copy.zt_stack[0], top_ztrace_copy.zt_depth);
extern unsigned int panicDebugging;
extern unsigned int logPanicDataToScreen;
+extern unsigned int kdebug_serial;
extern int db_run_mode;
void panic_display_zprint(void);
void panic_display_kernel_aslr(void);
void panic_display_hibb(void);
+void panic_display_model_name(void);
+void panic_display_kernel_uuid(void);
#if CONFIG_ZLEAKS
void panic_display_ztrace(void);
#endif /* CONFIG_ZLEAKS */
* post-panic crashdump/paniclog
* dump.
*/
-#define DB_NMI_BTN_ENA 0x8000 /* Enable button to directly trigger NMI */
+#define DB_NMI_BTN_ENA 0x8000 /* Enable button to directly trigger NMI */
+#define DB_PRT_KDEBUG 0x10000 /* kprintf KDEBUG traces */
#if DEBUG
/*
hv_callbacks_t hv_callbacks = {
.dispatch = NULL, /* thread is being dispatched for execution */
.preempt = NULL, /* thread is being preempted */
+ .suspend = NULL, /* system is being suspended */
.thread_destroy = NULL, /* thread is being destroyed */
.task_destroy = NULL, /* task is being destroyed */
.volatile_state = NULL, /* thread state is becoming volatile */
lck_mtx_unlock(hv_support_lck_mtx);
break;
} else {
- hv_callbacks.memory_pressure(NULL);
+ hv_callbacks.memory_pressure();
}
lck_mtx_unlock(hv_support_lck_mtx);
}
hv_callbacks = (hv_callbacks_t) {
.dispatch = NULL,
.preempt = NULL,
+ .suspend = NULL,
.thread_destroy = NULL,
.task_destroy = NULL,
.volatile_state = NULL,
lck_mtx_unlock(hv_support_lck_mtx);
}
+/* system suspend notification */
+void
+hv_suspend(void) {
+ if (hv_callbacks_enabled) {
+ hv_callbacks.suspend();
+ }
+}
+
/* dispatch hv_task_trap/hv_thread_trap syscalls to trap handlers,
fail for invalid index or absence of trap handlers, trap handler is
responsible for validating targets */
HV_THREAD_TRAP = 1
} hv_trap_type_t;
-typedef kern_return_t (*hv_trap_t) (void *thread_target, uint64_t arg);
-typedef void (*hv_callback_0_t)(void *target);
-typedef void (*hv_callback_1_t)(void *target, int argument);
+typedef kern_return_t (*hv_trap_t) (void *target, uint64_t arg);
typedef struct {
const hv_trap_t *traps;
} hv_trap_table_t;
typedef struct {
- hv_callback_0_t dispatch;
- hv_callback_0_t preempt;
- hv_callback_0_t thread_destroy;
- hv_callback_0_t task_destroy;
- hv_callback_1_t volatile_state;
- hv_callback_0_t memory_pressure;
+ void (*dispatch)(void *vcpu);
+ void (*preempt)(void *vcpu);
+ void (*suspend)(void);
+ void (*thread_destroy)(void *vcpu);
+ void (*task_destroy)(void *vm);
+ void (*volatile_state)(void *vcpu, int state);
+ void (*memory_pressure)(void);
} hv_callbacks_t;
extern hv_callbacks_t hv_callbacks;
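A hedged sketch of how a client of this interface might fill in the reshaped callback table, including the new suspend hook (all handler names are hypothetical):

/* Hedged sketch: register hypothetical handlers via hv_set_callbacks(). */
static void my_dispatch(void *vcpu)              { (void)vcpu; }
static void my_preempt(void *vcpu)               { (void)vcpu; }
static void my_suspend(void)                     { /* flush state before sleep */ }
static void my_thread_destroy(void *vcpu)        { (void)vcpu; }
static void my_task_destroy(void *vm)            { (void)vm; }
static void my_volatile_state(void *vcpu, int s) { (void)vcpu; (void)s; }
static void my_memory_pressure(void)             { }

static kern_return_t
register_hv_callbacks(void)
{
	hv_callbacks_t cb = {
		.dispatch        = my_dispatch,
		.preempt         = my_preempt,
		.suspend         = my_suspend,		/* new: system sleep */
		.thread_destroy  = my_thread_destroy,
		.task_destroy    = my_task_destroy,
		.volatile_state  = my_volatile_state,
		.memory_pressure = my_memory_pressure,
	};
	return hv_set_callbacks(cb);
}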
const hv_trap_t *traps, unsigned trap_count);
extern void hv_release_traps(hv_trap_type_t trap_type);
extern kern_return_t hv_set_callbacks(hv_callbacks_t callbacks);
-extern void hv_release_callbacks(void) ;
+extern void hv_release_callbacks(void);
+extern void hv_suspend(void);
extern kern_return_t hv_task_trap(uint64_t index, uint64_t arg);
extern kern_return_t hv_thread_trap(uint64_t index, uint64_t arg);
uint64_t off_time_interval;
timer_call_data_t on_timer;
+ uint64_t on_timer_deadline;
boolean_t on_timer_programmed;
boolean_t class_sfi_is_enabled;
/* Push out on-timer */
on_timer_deadline = now + sfi_classes[i].off_time_interval;
+ sfi_classes[i].on_timer_deadline = on_timer_deadline;
+
timer_call_enter1(&sfi_classes[i].on_timer, NULL, on_timer_deadline, TIMER_CALL_SYS_CRITICAL);
} else {
/* If this class no longer needs SFI, make sure the timer is cancelled */
sfi_classes[i].class_in_on_phase = TRUE;
if (sfi_classes[i].on_timer_programmed) {
sfi_classes[i].on_timer_programmed = FALSE;
+ sfi_classes[i].on_timer_deadline = ~0ULL;
timer_call_cancel(&sfi_classes[i].on_timer);
}
}
* Since we have the sfi_lock held and have changed "class_in_on_phase", we expect
* no new threads to be put on this wait queue until the global "off timer" has fired.
*/
+
sfi_class->class_in_on_phase = TRUE;
+ sfi_class->on_timer_programmed = FALSE;
+
kret = wait_queue_wakeup64_all(&sfi_class->wait_queue,
CAST_EVENT64_T(sfi_class_id),
THREAD_AWAKENED);
return (KERN_SUCCESS);
}
+/* Defers SFI off and per-class on timers (if live) by the specified interval
+ * in Mach Absolute Time Units. Currently invoked to align with the global
+ * forced idle mechanism. Making some simplifying assumptions, the iterative GFI
+ * induced SFI on+off deferrals form a geometric series that converges to yield
+ * an effective SFI duty cycle that is scaled by the GFI duty cycle. Initial phase
+ * alignment and congruency of the SFI/GFI periods can distort this to some extent.
+ */
+
+kern_return_t sfi_defer(uint64_t sfi_defer_matus)
+{
+ spl_t s;
+ kern_return_t kr = KERN_FAILURE;
+ s = splsched();
+
+ KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_GLOBAL_DEFER), sfi_defer_matus, 0, 0, 0, 0);
+
+ simple_lock(&sfi_lock);
+ if (!sfi_is_enabled) {
+ goto sfi_defer_done;
+ }
+
+ assert(sfi_next_off_deadline != 0);
+
+ sfi_next_off_deadline += sfi_defer_matus;
+ timer_call_enter1(&sfi_timer_call_entry, NULL, sfi_next_off_deadline, TIMER_CALL_SYS_CRITICAL);
+
+ int i;
+ for (i = 0; i < MAX_SFI_CLASS_ID; i++) {
+ if (sfi_classes[i].class_sfi_is_enabled) {
+ if (sfi_classes[i].on_timer_programmed) {
+ uint64_t new_on_deadline = sfi_classes[i].on_timer_deadline + sfi_defer_matus;
+ sfi_classes[i].on_timer_deadline = new_on_deadline;
+ timer_call_enter1(&sfi_classes[i].on_timer, NULL, new_on_deadline, TIMER_CALL_SYS_CRITICAL);
+ }
+ }
+ }
+
+ kr = KERN_SUCCESS;
+sfi_defer_done:
+ simple_unlock(&sfi_lock);
+
+ splx(s);
+
+ return (kr);
+}
+
kern_return_t sfi_get_window(uint64_t *window_usecs)
{
void sfi_ast(thread_t thread);
void sfi_reevaluate(thread_t thread);
+kern_return_t sfi_defer(uint64_t);
#endif /* MACH_KERNEL_PRIVATE */
#endif /* _KERN_SFI_H_ */
unsigned int wake_nkdbufs = 0;
unsigned int write_trace_on_panic = 0;
unsigned int trace_typefilter = 0;
+boolean_t trace_serial = FALSE;
/* mach leak logging */
int log_leaks = 0;
#endif
#if (defined(__i386__) || defined(__x86_64__))
+ if (kdebug_serial) {
+ new_nkdbufs = 1;
+ if (trace_typefilter == 0)
+ trace_typefilter = 1;
+ }
if (turn_on_log_leaks && !new_nkdbufs)
new_nkdbufs = 200000;
if (trace_typefilter)
#endif /* HYPERVISOR */
thread_template.t_chud = 0;
+
+#if (DEVELOPMENT || DEBUG)
+ thread_template.t_page_creation_throttled_hard = 0;
+ thread_template.t_page_creation_throttled_soft = 0;
+#endif /* DEVELOPMENT || DEBUG */
+ thread_template.t_page_creation_throttled = 0;
thread_template.t_page_creation_count = 0;
thread_template.t_page_creation_time = 0;
thread_terminate_enqueue(
thread_t thread)
{
- KERNEL_DEBUG_CONSTANT(TRACEDBG_CODE(DBG_TRACE_DATA, TRACE_DATA_THREAD_TERMINATE) | DBG_FUNC_NONE, thread->thread_id, 0, 0, 0, 0);
+ KERNEL_DEBUG_CONSTANT(TRACE_DATA_THREAD_TERMINATE | DBG_FUNC_NONE, thread->thread_id, 0, 0, 0, 0);
simple_lock(&thread_terminate_lock);
enqueue_tail(&thread_terminate_queue, (queue_entry_t)thread);
clock_sec_t t_page_creation_time;
uint32_t t_page_creation_count;
+ uint32_t t_page_creation_throttled;
+#if (DEVELOPMENT || DEBUG)
+ uint64_t t_page_creation_throttled_hard;
+ uint64_t t_page_creation_throttled_soft;
+#endif /* DEVELOPMENT || DEBUG */
+
#define T_CHUD_MARKED 0x01 /* this thread is marked by CHUD */
#define T_IN_CHUD 0x02 /* this thread is already in a CHUD handler */
#define THREAD_PMC_FLAG 0x04 /* Bit in "t_chud" signifying PMC interest */
#define CPUFAMILY_INTEL_SANDYBRIDGE 0x5490b78c
#define CPUFAMILY_INTEL_IVYBRIDGE 0x1f65e835
#define CPUFAMILY_INTEL_HASWELL 0x10b282dc
-#if !defined(XNU_HIDE_SEED)
#define CPUFAMILY_INTEL_BROADWELL 0x582ed09c
-#endif /* not XNU_HIDE_SEED */
#define CPUFAMILY_ARM_9 0xe73283ae
#define CPUFAMILY_ARM_11 0x8ff620d8
#define CPUFAMILY_ARM_XSCALE 0x53b005f5
int vm_scale = 16;
+int vm_compressor_is_active = 0;
int vm_compression_limit = 0;
extern boolean_t vm_swap_up;
vm_compressor_swap_init();
}
+ if (COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_SWAPBACKED)
+ vm_compressor_is_active = 1;
+
#if CONFIG_FREEZE
memorystatus_freeze_enabled = TRUE;
#endif /* CONFIG_FREEZE */
c_seg_free_locked(c_segment_t c_seg)
{
int segno, i;
- int pages_populated;
+ int pages_populated = 0;
int32_t *c_buffer = NULL;
- uint64_t c_swap_handle;
+ uint64_t c_swap_handle = 0;
assert(!c_seg->c_on_minorcompact_q);
} c_seg_major_compact_stats;
-#define C_MAJOR_COMPACTION_AGE_APPROPRIATE 30
-#define C_MAJOR_COMPACTION_OLD_ENOUGH 300
-#define C_MAJOR_COMPACTION_SIZE_APPROPRIATE ((C_SEG_BUFSIZE * 80) / 100)
+#define C_MAJOR_COMPACTION_SIZE_APPROPRIATE ((C_SEG_BUFSIZE * 90) / 100)
boolean_t
c_compress_page(char *src, c_slot_mapping_t slot_ptr, c_segment_t *current_chead, char *scratch_buf)
{
int c_size;
- int c_rounded_size;
+ int c_rounded_size = 0;
int max_csize;
c_slot_t cs;
c_segment_t c_seg;
#define NEED_TO_HARD_THROTTLE_THIS_TASK() (vm_wants_task_throttled(current_task()) || \
(vm_page_free_count < vm_page_throttle_limit && \
- proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO) >= THROTTLE_LEVEL_THROTTLED))
+ proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO) > THROTTLE_LEVEL_THROTTLED))
-#define HARD_THROTTLE_DELAY 20000 /* 20000 us == 20 ms */
-#define SOFT_THROTTLE_DELAY 2000 /* 2000 us == 2 ms */
+#define HARD_THROTTLE_DELAY 5000 /* 5000 us == 5 ms */
+#define SOFT_THROTTLE_DELAY 200 /* 200 us == .2 ms */
+
+#define VM_PAGE_CREATION_THROTTLE_PERIOD_SECS 6
+#define VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC 20000
+
boolean_t current_thread_aborted(void);
}
+#if (DEVELOPMENT || DEBUG)
+uint32_t vm_page_creation_throttled_hard = 0;
+uint32_t vm_page_creation_throttled_soft = 0;
+#endif /* DEVELOPMENT || DEBUG */
+
static int
-vm_page_throttled(void)
+vm_page_throttled(boolean_t page_kept)
{
clock_sec_t elapsed_sec;
clock_sec_t tv_sec;
if (thread->options & TH_OPT_VMPRIV)
return (0);
- thread->t_page_creation_count++;
-
- if (NEED_TO_HARD_THROTTLE_THIS_TASK())
+ if (thread->t_page_creation_throttled) {
+ thread->t_page_creation_throttled = 0;
+
+ if (page_kept == FALSE)
+ goto no_throttle;
+ }
+ if (NEED_TO_HARD_THROTTLE_THIS_TASK()) {
+#if (DEVELOPMENT || DEBUG)
+ thread->t_page_creation_throttled_hard++;
+ OSAddAtomic(1, &vm_page_creation_throttled_hard);
+#endif /* DEVELOPMENT || DEBUG */
return (HARD_THROTTLE_DELAY);
+ }
if ((vm_page_free_count < vm_page_throttle_limit || ((COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) && SWAPPER_NEEDS_TO_UNTHROTTLE())) &&
- thread->t_page_creation_count > vm_page_creation_throttle) {
+ thread->t_page_creation_count > (VM_PAGE_CREATION_THROTTLE_PERIOD_SECS * VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC)) {
clock_get_system_microtime(&tv_sec, &tv_usec);
elapsed_sec = tv_sec - thread->t_page_creation_time;
- if (elapsed_sec <= 6 || (thread->t_page_creation_count / elapsed_sec) >= (vm_page_creation_throttle / 6)) {
+ if (elapsed_sec <= VM_PAGE_CREATION_THROTTLE_PERIOD_SECS ||
+ (thread->t_page_creation_count / elapsed_sec) >= VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC) {
- if (elapsed_sec >= 60) {
+ if (elapsed_sec >= (3 * VM_PAGE_CREATION_THROTTLE_PERIOD_SECS)) {
/*
* we'll reset our stats to give a well behaved app
* that was unlucky enough to accumulate a bunch of pages
* will remain in the throttled state
*/
thread->t_page_creation_time = tv_sec;
- thread->t_page_creation_count = (vm_page_creation_throttle / 6) * 5;
+ thread->t_page_creation_count = VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC * (VM_PAGE_CREATION_THROTTLE_PERIOD_SECS - 1);
}
++vm_page_throttle_count;
- if ((COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) && HARD_THROTTLE_LIMIT_REACHED())
+ thread->t_page_creation_throttled = 1;
+
+ if ((COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) && HARD_THROTTLE_LIMIT_REACHED()) {
+#if (DEVELOPMENT || DEBUG)
+ thread->t_page_creation_throttled_hard++;
+ OSAddAtomic(1, &vm_page_creation_throttled_hard);
+#endif /* DEVELOPMENT || DEBUG */
return (HARD_THROTTLE_DELAY);
- else
+ } else {
+#if (DEVELOPMENT || DEBUG)
+ thread->t_page_creation_throttled_soft++;
+ OSAddAtomic(1, &vm_page_creation_throttled_soft);
+#endif /* DEVELOPMENT || DEBUG */
return (SOFT_THROTTLE_DELAY);
+ }
}
thread->t_page_creation_time = tv_sec;
thread->t_page_creation_count = 0;
}
+no_throttle:
+ thread->t_page_creation_count++;
+
return (0);
}
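With the new constants, the burst budget works out to 6 s * 20,000 pages/s = 120,000 freshly created pages, after which a thread is throttled while its sustained creation rate stays at or above 20,000 pages per second. A minimal sketch of just that rate test (the free-page and compressor pressure checks are elided):

/* Hedged sketch of the rate test above; memory-pressure checks elided. */
static int
example_soft_throttle(uint32_t pages_created, clock_sec_t elapsed_sec)
{
	const uint32_t burst = VM_PAGE_CREATION_THROTTLE_PERIOD_SECS *
	    VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC;	/* 120000 pages */

	if (pages_created <= burst)
		return (0);
	if (elapsed_sec <= VM_PAGE_CREATION_THROTTLE_PERIOD_SECS)
		return (1);		/* burst exceeded within the window */
	return ((pages_created / elapsed_sec) >=
	    VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC);
}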
-
/*
* check for various conditions that would
* prevent us from creating a ZF page...
* object == m->object
*/
static vm_fault_return_t
-vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, boolean_t interruptible_state)
+vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, boolean_t interruptible_state, boolean_t page_throttle)
{
int throttle_delay;
return (VM_FAULT_RETRY);
}
}
- if ((throttle_delay = vm_page_throttled())) {
+ if (page_throttle == TRUE && (throttle_delay = vm_page_throttled(FALSE))) {
/*
* we're throttling zero-fills...
* treat this as if we couldn't grab a page
* fault cleanup in the case of an error condition
* including resetting the thread_interrupt_level
*/
- error = vm_fault_check(object, m, first_m, interruptible_state);
+ error = vm_fault_check(object, m, first_m, interruptible_state, (type_of_fault == NULL) ? TRUE : FALSE);
if (error != VM_FAULT_SUCCESS)
return (error);
0,
&compressed_count_delta);
+ if (type_of_fault == NULL) {
+ int throttle_delay;
+
+ /*
+ * we weren't called from vm_fault, so we
+ * need to apply page creation throttling;
+ * do it before we re-acquire any locks
+ */
+ if (my_fault_type == DBG_COMPRESSOR_FAULT) {
+ if ((throttle_delay = vm_page_throttled(TRUE))) {
+ VM_DEBUG_EVENT(vmf_compressordelay, VMF_COMPRESSORDELAY, DBG_FUNC_NONE, throttle_delay, 0, 1, 0);
+ delay(throttle_delay);
+ }
+ }
+ }
vm_object_lock(object);
assert(object->paging_in_progress > 0);
* fault cleanup in the case of an error condition
* including resetting the thread_interrupt_level
*/
- error = vm_fault_check(object, m, first_m, interruptible_state);
+ error = vm_fault_check(object, m, first_m, interruptible_state, (type_of_fault == NULL) ? TRUE : FALSE);
if (error != VM_FAULT_SUCCESS)
return (error);
*/
assert(object_lock_type == OBJECT_LOCK_EXCLUSIVE);
- if ((throttle_delay = vm_page_throttled())) {
- /*
- * drop all of our locks...
- * wait until the free queue is
- * pumped back up and then
- * redrive the fault
- */
- if (object != cur_object)
- vm_object_unlock(cur_object);
- vm_object_unlock(object);
- vm_map_unlock_read(map);
- if (real_map != map)
- vm_map_unlock(real_map);
-
- VM_DEBUG_EVENT(vmf_cowdelay, VMF_COWDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
-
- delay(throttle_delay);
-
- if (!current_thread_aborted() && vm_page_wait((change_wiring) ?
- THREAD_UNINT :
- THREAD_ABORTSAFE))
- goto RetryFault;
- kr = KERN_ABORTED;
- goto done;
- }
/*
* If objects match, then
* object->copy must not be NULL (else control
kr = KERN_MEMORY_ERROR;
goto done;
}
- if ((throttle_delay = vm_page_throttled())) {
- /*
- * drop all of our locks...
- * wait until the free queue is
- * pumped back up and then
- * redrive the fault
- */
- if (object != cur_object)
- vm_object_unlock(cur_object);
- vm_object_unlock(object);
- vm_map_unlock_read(map);
- if (real_map != map)
- vm_map_unlock(real_map);
-
- VM_DEBUG_EVENT(vmf_zfdelay, VMF_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
-
- delay(throttle_delay);
-
- if (!current_thread_aborted() && vm_page_wait((change_wiring) ?
- THREAD_UNINT :
- THREAD_ABORTSAFE))
- goto RetryFault;
- kr = KERN_ABORTED;
- goto done;
- }
if (vm_backing_store_low) {
/*
* we are protecting the system from
thread_interrupt_level(interruptible_state);
/*
- * Only throttle on faults which cause a pagein.
+ * Only I/O throttle on faults which cause a pagein/swapin.
*/
if ((type_of_fault == DBG_PAGEIND_FAULT) || (type_of_fault == DBG_PAGEINV_FAULT) || (type_of_fault == DBG_COMPRESSOR_SWAPIN_FAULT)) {
throttle_lowpri_io(1);
- }
+ } else {
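+ /*
+ * successful faults that created a page (e.g. zero-fill, COW,
+ * compressor) are subject to the page creation throttle;
+ * delay here if it's currently engaged
+ */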
+ if (kr == KERN_SUCCESS && type_of_fault != DBG_CACHE_HIT_FAULT && type_of_fault != DBG_GUARD_FAULT) {
+ if ((throttle_delay = vm_page_throttled(TRUE))) {
+
+ if (vm_debug_events) {
+ if (type_of_fault == DBG_COMPRESSOR_FAULT)
+ VM_DEBUG_EVENT(vmf_compressordelay, VMF_COMPRESSORDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
+ else if (type_of_fault == DBG_COW_FAULT)
+ VM_DEBUG_EVENT(vmf_cowdelay, VMF_COWDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
+ else
+ VM_DEBUG_EVENT(vmf_zfdelay, VMF_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
+ }
+ delay(throttle_delay);
+ }
+ }
+ }
KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
(MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
((uint64_t)vaddr >> 32),
*/
new_copy = (vm_map_copy_t) zalloc(vm_map_copy_zone);
+ new_copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE;
*new_copy = *copy;
if (copy->type == VM_MAP_COPY_ENTRY_LIST) {
/* destroyed after successful copy_overwrite */
copy = (vm_map_copy_t)
zalloc(vm_map_copy_zone);
+ copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE;
vm_map_copy_first_entry(copy) =
vm_map_copy_last_entry(copy) =
vm_map_copy_to_entry(copy);
* Extract "head_copy" out of "copy".
*/
head_copy = (vm_map_copy_t) zalloc(vm_map_copy_zone);
+ head_copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE;
vm_map_copy_first_entry(head_copy) =
vm_map_copy_to_entry(head_copy);
vm_map_copy_last_entry(head_copy) =
* Extract "tail_copy" out of "copy".
*/
tail_copy = (vm_map_copy_t) zalloc(vm_map_copy_zone);
+ tail_copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE;
vm_map_copy_first_entry(tail_copy) =
vm_map_copy_to_entry(tail_copy);
vm_map_copy_last_entry(tail_copy) =
*/
copy = (vm_map_copy_t) zalloc(vm_map_copy_zone);
+ copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE;
vm_map_copy_first_entry(copy) =
vm_map_copy_last_entry(copy) = vm_map_copy_to_entry(copy);
copy->type = VM_MAP_COPY_ENTRY_LIST;
*/
copy = (vm_map_copy_t) zalloc(vm_map_copy_zone);
+ copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE;
vm_map_copy_first_entry(copy) =
vm_map_copy_last_entry(copy) = vm_map_copy_to_entry(copy);
copy->type = VM_MAP_COPY_ENTRY_LIST;
*/
copy = (vm_map_copy_t) zalloc(vm_map_copy_zone);
+ copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE;
copy->type = VM_MAP_COPY_OBJECT;
copy->cpy_object = object;
copy->offset = offset;
}
#endif
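+/*
+ * Return FALSE for a header whose rbh_root holds the SKIP_RB_TREE
+ * sentinel (i.e. its entries are not tracked in the red-black tree),
+ * TRUE otherwise.
+ */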
+boolean_t
+vm_map_store_has_RB_support( struct vm_map_header *hdr )
+{
+ if ((void*)hdr->rb_head_store.rbh_root == (void*)(int)SKIP_RB_TREE) {
+ return FALSE;
+ }
+ return TRUE;
+}
+
void
vm_map_store_init( struct vm_map_header *hdr )
{
vm_map_store_init_ll( hdr );
#ifdef VM_MAP_STORE_USE_RB
- vm_map_store_init_rb( hdr );
+ if (vm_map_store_has_RB_support( hdr )) {
+ vm_map_store_init_rb( hdr );
+ }
#endif
}
#ifdef VM_MAP_STORE_USE_LL
return (vm_map_store_lookup_entry_ll( map, address, entry ));
#elif defined VM_MAP_STORE_USE_RB
- return (vm_map_store_lookup_entry_rb( map, address, entry ));
+ if (vm_map_store_has_RB_support( &map->hdr )) {
+ return (vm_map_store_lookup_entry_rb( map, address, entry ));
+ } else {
+ panic("VM map lookups need RB tree support.\n");
+ return FALSE; /* For compiler warning.*/
+ }
#endif
}
{
vm_map_store_copy_insert_ll(map, after_where, copy);
#ifdef VM_MAP_STORE_USE_RB
- vm_map_store_copy_insert_rb(map, after_where, copy);
+ if (vm_map_store_has_RB_support( &map->hdr )) {
+ vm_map_store_copy_insert_rb(map, after_where, copy);
+ }
#endif
}
assert(entry->vme_start < entry->vme_end);
vm_map_store_entry_link_ll(mapHdr, after_where, entry);
#ifdef VM_MAP_STORE_USE_RB
- vm_map_store_entry_link_rb(mapHdr, after_where, entry);
+ if (vm_map_store_has_RB_support( mapHdr )) {
+ vm_map_store_entry_link_rb(mapHdr, after_where, entry);
+ }
#endif
#if MAP_ENTRY_INSERTION_DEBUG
fastbacktrace(&entry->vme_insertion_bt[0],
} else {
update_first_free_ll(VMEL_map, VMEL_map->first_free);
#ifdef VM_MAP_STORE_USE_RB
- update_first_free_rb(VMEL_map, VMEL_map->first_free);
+ if (vm_map_store_has_RB_support( &VMEL_map->hdr )) {
+ update_first_free_rb(VMEL_map, VMEL_map->first_free);
+ }
#endif
}
}
{
vm_map_store_entry_unlink_ll(mapHdr, entry);
#ifdef VM_MAP_STORE_USE_RB
- vm_map_store_entry_unlink_rb(mapHdr, entry);
+ if (vm_map_store_has_RB_support( mapHdr )) {
+ vm_map_store_entry_unlink_rb(mapHdr, entry);
+ }
#endif
}
vm_map_store_update( map, entry, VM_MAP_ENTRY_DELETE);
update_first_free_ll(VMEU_map, VMEU_first_free);
#ifdef VM_MAP_STORE_USE_RB
- update_first_free_rb(VMEU_map, VMEU_first_free);
+ if (vm_map_store_has_RB_support( &VMEU_map->hdr )) {
+ update_first_free_rb(VMEU_map, VMEU_first_free);
+ }
#endif
}
int nentries = copy->cpy_hdr.nentries;
vm_map_store_copy_reset_ll(copy, entry, nentries);
#ifdef VM_MAP_STORE_USE_RB
- vm_map_store_copy_reset_rb(copy, entry, nentries);
+ if (vm_map_store_has_RB_support( &copy->c_u.hdr )) {
+ vm_map_store_copy_reset_rb(copy, entry, nentries);
+ }
#endif
}
{
update_first_free_ll(map, first_free);
#ifdef VM_MAP_STORE_USE_RB
- update_first_free_rb(map, first_free);
+ if (vm_map_store_has_RB_support( &map->hdr )) {
+ update_first_free_rb(map, first_free);
+ }
#endif
}
(map)->hint = (value); \
MACRO_END
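+/*
+ * Sentinel stored in a header's rb_head_store.rbh_root to indicate that
+ * its entries are not tracked in the red-black tree (used for
+ * vm_map_copy headers); see vm_map_store_has_RB_support().
+ */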
+#define SKIP_RB_TREE 0xBAADC0D1
+
#define VM_MAP_ENTRY_CREATE 1
#define VM_MAP_ENTRY_DELETE 2
#if MACH_ASSERT
boolean_t first_free_is_valid_store( struct _vm_map*);
#endif
+boolean_t vm_map_store_has_RB_support( struct vm_map_header *hdr );
#endif /* _VM_VM_MAP_STORE_H */
uint32_t vm_pageout_considered_page = 0;
uint32_t vm_page_filecache_min = 0;
-#define VM_PAGE_FILECACHE_MIN 50000
#define ANONS_GRABBED_LIMIT 2
/*
if (cache_evict_throttle)
cache_evict_throttle--;
+ /*
+ * don't let the filecache_min fall below 33% of available memory...
+ *
+ * on systems w/o the compressor/swapper, the filecache is always
+ * a very large percentage of the AVAILABLE_NON_COMPRESSED_MEMORY
+ * since most (if not all) of the anonymous pages are in the
+ * throttled queue (which isn't counted as available), which
+ * effectively disables this filter
+ */
+ vm_page_filecache_min = (AVAILABLE_NON_COMPRESSED_MEMORY / 3);
exceeded_burst_throttle = FALSE;
/*
page_prev_state = PAGE_STATE_INACTIVE;
anons_grabbed = 0;
+ if (vm_page_pageable_external_count < vm_page_filecache_min) {
+ if ((++reactivated_this_call % 100))
+ goto must_activate_page;
+ /*
+ * steal 1% of the file backed pages even if
+ * we are under the limit that has been set
+ * for a healthy filecache
+ */
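+ /*
+ * (the "% 100" test reactivates 99 of every 100 of these
+ * pages via must_activate_page; only 1 in 100 falls through)
+ */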
+ }
break;
}
}
vm_page_deactivate(m);
vm_pageout_inactive_deactivated++;
} else {
+must_activate_page:
/*
* The page was/is being used, so put back on active list.
*/
vm_page_free_target = vm_page_free_min + 5;
vm_page_throttle_limit = vm_page_free_target - (vm_page_free_target / 3);
- vm_page_creation_throttle = vm_page_free_target * 3;
}
/*
void
vm_pageout_reinit_tuneables(void)
{
- vm_page_filecache_min = (uint32_t) (max_mem / PAGE_SIZE) / 15;
-
- if (vm_page_filecache_min < VM_PAGE_FILECACHE_MIN)
- vm_page_filecache_min = VM_PAGE_FILECACHE_MIN;
-
vm_compressor_minorcompact_threshold_divisor = 18;
vm_compressor_majorcompact_threshold_divisor = 22;
vm_compressor_unthrottle_threshold_divisor = 32;
if (vm_pageout_burst_inactive_throttle == 0)
vm_pageout_burst_inactive_throttle = VM_PAGEOUT_BURST_INACTIVE_THROTTLE;
-#if !CONFIG_JETSAM
- vm_page_filecache_min = (uint32_t) (max_mem / PAGE_SIZE) / 20;
- if (vm_page_filecache_min < VM_PAGE_FILECACHE_MIN)
- vm_page_filecache_min = VM_PAGE_FILECACHE_MIN;
-#endif
-
/*
* Set kernel task to low backing store privileged
* status
}
src_upl->decmp_io_upl = (void *)upl;
src_upl->ref_count++;
- upl_unlock(src_upl);
upl->flags |= UPL_DECMP_REAL_IO;
upl->decmp_io_upl = (void *)src_upl;
-
+ upl_unlock(src_upl);
}
#endif /* CONFIG_IOSCHED */
unsigned int vm_page_free_target = 0;
unsigned int vm_page_free_min = 0;
unsigned int vm_page_throttle_limit = 0;
-uint32_t vm_page_creation_throttle = 0;
unsigned int vm_page_inactive_target = 0;
unsigned int vm_page_anonymous_min = 0;
unsigned int vm_page_inactive_min = 0;
goto reenter_pg_on_q;
}
- vm_pageout_scan_wants_object = m_object;
vm_page_unlock_queues();
mutex_pause(try_failed_count++);
continue;
} else {
l_object = m_object;
- vm_pageout_scan_wants_object = VM_OBJECT_NULL;
}
}
if ( !m_object->alive || m->encrypted_cleaning || m->cleaning || m->laundry || m->busy || m->absent || m->error) {
vm_object_unlock(l_object);
l_object = NULL;
}
- vm_pageout_scan_wants_object = VM_OBJECT_NULL;
while (retval == 0) {
vm_object_unlock(l_object);
l_object = NULL;
}
- vm_pageout_scan_wants_object = VM_OBJECT_NULL;
vm_page_unlock_queues();
#define COPYINPHYS 3 /* from user virtual to kernel physical */
#define COPYOUTPHYS 4 /* from kernel physical to user virtual */
+#if DEVELOPMENT
+typedef struct {
+ uint64_t timestamp;
+ thread_t thread;
+ uintptr_t cr4;
+ uint8_t cpuid;
+ uint8_t smap_state;
+ uint8_t copyio_active;
+} smaplog_entry_t;
+
+#define SMAPLOG_BUFFER_SIZE (50)
+static smaplog_entry_t smaplog_cbuf[SMAPLOG_BUFFER_SIZE];
+static uint32_t smaplog_head = 0;
+
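+/*
+ * Lock-free log of SMAP enable/disable transitions: the CAS loop below
+ * claims the next slot by atomically advancing smaplog_head around the
+ * circular buffer.
+ */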
+static void
+smaplog_add_entry(boolean_t enabling)
+{
+ uint32_t index = 0;
+ thread_t thread = current_thread();
+
+ do {
+ index = smaplog_head;
+ } while (!OSCompareAndSwap(index, (index + 1) % SMAPLOG_BUFFER_SIZE, &smaplog_head));
+
+ assert(index < SMAPLOG_BUFFER_SIZE);
+ assert(smaplog_head < SMAPLOG_BUFFER_SIZE);
+ assert(thread);
+
+ smaplog_cbuf[index].timestamp = mach_absolute_time();
+ smaplog_cbuf[index].thread = thread;
+ smaplog_cbuf[index].cpuid = cpu_number();
+ smaplog_cbuf[index].cr4 = get_cr4();
+ smaplog_cbuf[index].smap_state = enabling;
+ smaplog_cbuf[index].copyio_active = (thread->machine.specFlags & CopyIOActive) ? 1 : 0;
+}
+#endif /* DEVELOPMENT */
+
+extern boolean_t pmap_smap_enabled;
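+/*
+ * With CR4.SMAP set, supervisor-mode accesses to user-mapped pages fault
+ * unless EFLAGS.AC is set; stac()/clac() toggle AC around the copyio
+ * window so the kernel can legitimately touch the user buffer.
+ */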
+static inline void user_access_enable(void) {
+ if (pmap_smap_enabled) {
+ stac();
+#if DEVELOPMENT
+ smaplog_add_entry(TRUE);
+#endif
+ }
+}
+static inline void user_access_disable(void) {
+ if (pmap_smap_enabled) {
+ clac();
+#if DEVELOPMENT
+ smaplog_add_entry(FALSE);
+#endif
+ }
+}
static int
copyio(int copy_type, user_addr_t user_addr, char *kernel_addr,
*/
recursive_CopyIOActive = thread->machine.specFlags & CopyIOActive;
thread->machine.specFlags |= CopyIOActive;
+ user_access_enable();
if (no_shared_cr3) {
istate = ml_set_interrupts_enabled(FALSE);
if (get_cr3_base() != pmap->pm_cr3)
break;
}
+ user_access_disable();
if (!recursive_CopyIOActive) {
thread->machine.specFlags &= ~CopyIOActive;
}
pmap_smep_enabled = TRUE;
}
}
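+ /*
+ * SMAP is enabled whenever the CPU advertises it, unless the
+ * -pmap_smap_disable boot-arg is present.
+ */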
+ if (cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_SMAP) {
+ boolean_t nsmap;
+ if (!PE_parse_boot_argn("-pmap_smap_disable", &nsmap, sizeof(nsmap))) {
+ set_cr4(get_cr4() | CR4_SMAP);
+ pmap_smap_enabled = TRUE;
+ }
+ }
if (cdp->cpu_fixed_pmcs_enabled) {
boolean_t enable = TRUE;
if (pmap_smep_enabled)
printf("PMAP: Supervisor Mode Execute Protection enabled\n");
+ if (pmap_smap_enabled)
+ printf("PMAP: Supervisor Mode Access Protection enabled\n");
#if DEBUG
printf("Stack canary: 0x%lx\n", __stack_chk_guard[0]);
uint32_t pciConfigSpaceEndBusNumber;
uint32_t csrActiveConfig;
uint32_t csrPendingConfig;
- uint32_t __reserved4[728];
+ uint32_t boot_SMC_plimit;
+ uint32_t __reserved4[727];
} boot_args;
char * my_pathp = NULL;
- uid_t euid,ruid;
+ uid_t ruid;
struct stat my_sb;
FILE * file_handle;
file_handle = fopen(FILE_NOTME, "w");
fclose(file_handle);
- /* Currently running as root (through setreuid manipulation), switch to running as the current user. */
- euid = geteuid();
+ /* Currently running as root (through settid manipulation), switch to running as the current user. */
ruid = getuid();
- setreuid(ruid, ruid);
+ my_err = syscall(SYS_settid, ruid, KAUTH_GID_NONE);
+ if (my_err != 0) {
+ printf("Failed to settid to non-root with error %d:%s\n", errno, strerror(errno));
+ goto test_failed_exit;
+ }
/* Create a file that the current user owns */
file_handle = fopen(FILE_ME, "w");
}
/* Reset to running as root */
- setreuid(ruid, euid);
-
+ my_err = syscall(SYS_settid, KAUTH_UID_NONE, KAUTH_GID_NONE);
+ if (my_err != 0) {
+ printf("Failed to revert to root using settid with error %d:%s\n", errno, strerror(errno));
+ goto test_failed_exit;
+ }
if(error_occurred == 1) {
goto test_failed_exit;
}
char * my_namep = NULL;
char * my_pathp = NULL;
- uid_t euid,ruid;
+ uid_t ruid;
struct stat my_sb;
FILE * file_handle;
file_handle = fopen(FILE_NOTME, "w");
fclose(file_handle);
- /* Currently running as root (through setreuid manipulation), switch to running as the current user. */
- euid = geteuid();
+ /* Currently running as root (through settid manipulation), switch to running as the current user. */
ruid = getuid();
- setreuid(ruid, ruid);
+ my_err = syscall(SYS_settid, ruid, KAUTH_GID_NONE);
+ if (my_err != 0) {
+ printf("Failed to settid to non-root with error %d:%s\n", errno, strerror(errno));
+ goto test_failed_exit;
+ }
/* Create a file that the current user owns */
file_handle = fopen(FILE_ME, "w");
}
/* Reset to running as root */
- setreuid(ruid, euid);
+ my_err = syscall(SYS_settid, KAUTH_UID_NONE, KAUTH_GID_NONE);
+ if (my_err != 0) {
+ printf("Failed to settid revert to root with error %d:%s\n", errno, strerror(errno));
+ goto test_failed_exit;
+ }
if(error_occurred == 1) {
goto test_failed_exit;