From: Apple Date: Fri, 25 Sep 2015 15:59:39 +0000 (+0000) Subject: xnu-2782.20.48.tar.gz X-Git-Tag: os-x-10103^0 X-Git-Url: https://git.saurik.com/apple/xnu.git/commitdiff_plain/04b8595b18b1b41ac7a206e4b3d51a635f8413d7 xnu-2782.20.48.tar.gz --- diff --git a/bsd/conf/files b/bsd/conf/files index 54b4ef14d..d4ce218f8 100644 --- a/bsd/conf/files +++ b/bsd/conf/files @@ -381,6 +381,7 @@ bsd/hfs/hfs_cnode.c optional hfs bsd/hfs/hfs_encodinghint.c standard bsd/hfs/hfs_encodings.c standard bsd/hfs/hfs_endian.c optional hfs +bsd/hfs/hfs_fsinfo.c optional hfs bsd/hfs/hfs_hotfiles.c optional hfs bsd/hfs/hfs_link.c optional hfs bsd/hfs/hfs_lookup.c optional hfs diff --git a/bsd/dev/dtrace/dtrace.c b/bsd/dev/dtrace/dtrace.c index 25f6d7c8e..dd02ad502 100644 --- a/bsd/dev/dtrace/dtrace.c +++ b/bsd/dev/dtrace/dtrace.c @@ -20,7 +20,8 @@ */ /* - * Portions copyright (c) 2011, Joyent, Inc. All rights reserved. + * Portions Copyright (c) 2011, Joyent, Inc. All rights reserved. + * Portions Copyright (c) 2012 by Delphix. All rights reserved. */ /* @@ -2583,9 +2584,10 @@ dtrace_speculation_commit(dtrace_state_t *state, processorid_t cpu, { dtrace_speculation_t *spec; dtrace_buffer_t *src, *dest; - uintptr_t daddr, saddr, dlimit; + uintptr_t daddr, saddr, dlimit, slimit; dtrace_speculation_state_t current, new = DTRACESPEC_INACTIVE; intptr_t offs; + uint64_t timestamp; if (which == 0) return; @@ -2661,7 +2663,38 @@ dtrace_speculation_commit(dtrace_state_t *state, processorid_t cpu, } /* - * We have the space; copy the buffer across. (Note that this is a + * We have sufficient space to copy the speculative buffer into the + * primary buffer. First, modify the speculative buffer, filling + * in the timestamp of all entries with the current time. The data + * must have the commit() time rather than the time it was traced, + * so that all entries in the primary buffer are in timestamp order. + */ + timestamp = dtrace_gethrtime(); + saddr = (uintptr_t)src->dtb_tomax; + slimit = saddr + src->dtb_offset; + while (saddr < slimit) { + size_t size; + dtrace_rechdr_t *dtrh = (dtrace_rechdr_t *)saddr; + + if (dtrh->dtrh_epid == DTRACE_EPIDNONE) { + saddr += sizeof (dtrace_epid_t); + continue; + } + + ASSERT(dtrh->dtrh_epid <= ((dtrace_epid_t) state->dts_necbs)); + size = state->dts_ecbs[dtrh->dtrh_epid - 1]->dte_size; + + ASSERT(saddr + size <= slimit); + ASSERT(size >= sizeof(dtrace_rechdr_t)); + ASSERT(DTRACE_RECORD_LOAD_TIMESTAMP(dtrh) == UINT64_MAX); + + DTRACE_RECORD_STORE_TIMESTAMP(dtrh, timestamp); + + saddr += size; + } + + /* + * Copy the buffer across. (Note that this is a * highly subobtimal bcopy(); in the unlikely event that this becomes * a serious performance issue, a high-performance DTrace-specific * bcopy() should obviously be invented.) @@ -6119,8 +6152,23 @@ __dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1, tomax = buf->dtb_tomax; ASSERT(tomax != NULL); - if (ecb->dte_size != 0) - DTRACE_STORE(uint32_t, tomax, offs, ecb->dte_epid); + /* + * Build and store the record header corresponding to the ECB. 
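For reference, the record header written here is the new dtrace_rechdr_t, which replaces the bare 4-byte EPID that buffers used to carry and packs the EPID together with a split 64-bit timestamp. Its definition and the DTRACE_RECORD_LOAD/STORE_TIMESTAMP macros used throughout this file come from the companion dtrace header change, which is not shown in this excerpt; the sketch below reflects the upstream (illumos/Delphix) layout this commit imports, with dtrace_epid_t assumed to be a uint32_t, so the header keeps the 4-byte alignment that dte_alignment continues to use:

    #include <stdint.h>

    typedef uint32_t dtrace_epid_t;              /* enabled probe id, as defined elsewhere */

    typedef struct dtrace_rechdr {
        dtrace_epid_t dtrh_epid;                 /* enabled probe id */
        uint32_t      dtrh_timestamp_hi;         /* high bits of hrtime_t */
        uint32_t      dtrh_timestamp_lo;         /* low bits of hrtime_t */
    } dtrace_rechdr_t;

    /* Reassemble the 64-bit timestamp from its two 32-bit halves. */
    #define DTRACE_RECORD_LOAD_TIMESTAMP(dtrh)              \
        ((dtrh)->dtrh_timestamp_lo +                        \
        ((uint64_t)(dtrh)->dtrh_timestamp_hi << 32))

    /* Split a 64-bit hrtime_t into the two 32-bit halves. */
    #define DTRACE_RECORD_STORE_TIMESTAMP(dtrh, hrtime) {   \
        (dtrh)->dtrh_timestamp_lo = (uint32_t)hrtime;       \
        (dtrh)->dtrh_timestamp_hi = hrtime >> 32;           \
    }

Speculative records receive UINT64_MAX here as a sentinel and have it overwritten with the commit time in dtrace_speculation_commit(), which is what keeps the consumer's primary buffer in timestamp order.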
+ */ + if (ecb->dte_size != 0) { + dtrace_rechdr_t dtrh; + + if (!(mstate.dtms_present & DTRACE_MSTATE_TIMESTAMP)) { + mstate.dtms_timestamp = dtrace_gethrtime(); + mstate.dtms_present |= DTRACE_MSTATE_TIMESTAMP; + } + + ASSERT(ecb->dte_size >= sizeof(dtrace_rechdr_t)); + + dtrh.dtrh_epid = ecb->dte_epid; + DTRACE_RECORD_STORE_TIMESTAMP(&dtrh, mstate.dtms_timestamp); + DTRACE_STORE(dtrace_rechdr_t, tomax, offs, dtrh); + } mstate.dtms_epid = ecb->dte_epid; mstate.dtms_present |= DTRACE_MSTATE_EPID; @@ -6268,7 +6316,9 @@ __dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1, continue; switch (act->dta_kind) { - case DTRACEACT_SPECULATE: + case DTRACEACT_SPECULATE: { + dtrace_rechdr_t *dtrh = NULL; + ASSERT(buf == &state->dts_buffer[cpuid]); buf = dtrace_speculation_buffer(state, cpuid, val); @@ -6291,9 +6341,23 @@ __dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1, ASSERT(tomax != NULL); if (ecb->dte_size != 0) - DTRACE_STORE(uint32_t, tomax, offs, - ecb->dte_epid); - continue; + continue; + + ASSERT(ecb->dte_size >= sizeof(dtrace_rechdr_t)); + dtrh = ((void *)(tomax + offs)); + dtrh->dtrh_epid = ecb->dte_epid; + + /* + * When the speculation is committed, all of + * the records in the speculative buffer will + * have their timestamps set to the commit + * time. Until then, it is set to a sentinel + * value, for debugability. + */ + DTRACE_RECORD_STORE_TIMESTAMP(dtrh, UINT64_MAX); + + continue; + } case DTRACEACT_CHILL: if (dtrace_priv_kernel_destructive(state)) @@ -9559,9 +9623,9 @@ dtrace_ecb_add(dtrace_state_t *state, dtrace_probe_t *probe) /* * The default size is the size of the default action: recording - * the epid. + * the header. */ - ecb->dte_size = ecb->dte_needed = sizeof (dtrace_epid_t); + ecb->dte_size = ecb->dte_needed = sizeof (dtrace_rechdr_t); ecb->dte_alignment = sizeof (dtrace_epid_t); epid = state->dts_epid++; @@ -9661,122 +9725,85 @@ dtrace_ecb_enable(dtrace_ecb_t *ecb) static void dtrace_ecb_resize(dtrace_ecb_t *ecb) { - uint32_t maxalign = sizeof (dtrace_epid_t); - uint32_t align = sizeof (uint8_t), offs, diff; dtrace_action_t *act; - int wastuple = 0; + uint32_t curneeded = UINT32_MAX; uint32_t aggbase = UINT32_MAX; - dtrace_state_t *state = ecb->dte_state; /* - * If we record anything, we always record the epid. (And we always - * record it first.) + * If we record anything, we always record the dtrace_rechdr_t. (And + * we always record it first.) */ - offs = sizeof (dtrace_epid_t); - ecb->dte_size = ecb->dte_needed = sizeof (dtrace_epid_t); + ecb->dte_size = sizeof (dtrace_rechdr_t); + ecb->dte_alignment = sizeof (dtrace_epid_t); for (act = ecb->dte_action; act != NULL; act = act->dta_next) { dtrace_recdesc_t *rec = &act->dta_rec; + ASSERT(rec->dtrd_size > 0 || rec->dtrd_alignment == 1); - if ((align = rec->dtrd_alignment) > maxalign) - maxalign = align; - - if (!wastuple && act->dta_intuple) { - /* - * This is the first record in a tuple. Align the - * offset to be at offset 4 in an 8-byte aligned - * block. - */ - diff = offs + sizeof (dtrace_aggid_t); - - if ((diff = (diff & (sizeof (uint64_t) - 1)))) - offs += sizeof (uint64_t) - diff; - - aggbase = offs - sizeof (dtrace_aggid_t); - ASSERT(!(aggbase & (sizeof (uint64_t) - 1))); - } - - /*LINTED*/ - if (rec->dtrd_size != 0 && (diff = (offs & (align - 1)))) { - /* - * The current offset is not properly aligned; align it. 
- */ - offs += align - diff; - } - - rec->dtrd_offset = offs; - - if (offs + rec->dtrd_size > ecb->dte_needed) { - ecb->dte_needed = offs + rec->dtrd_size; - - if (ecb->dte_needed > state->dts_needed) - state->dts_needed = ecb->dte_needed; - } + ecb->dte_alignment = MAX(ecb->dte_alignment, rec->dtrd_alignment); if (DTRACEACT_ISAGG(act->dta_kind)) { dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act; - dtrace_action_t *first = agg->dtag_first, *prev; - ASSERT(rec->dtrd_size != 0 && first != NULL); - ASSERT(wastuple); + ASSERT(rec->dtrd_size != 0); + ASSERT(agg->dtag_first != NULL); + ASSERT(act->dta_prev->dta_intuple); ASSERT(aggbase != UINT32_MAX); + ASSERT(curneeded != UINT32_MAX); agg->dtag_base = aggbase; - while ((prev = first->dta_prev) != NULL && - DTRACEACT_ISAGG(prev->dta_kind)) { - agg = (dtrace_aggregation_t *)prev; - first = agg->dtag_first; - } + curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment); + rec->dtrd_offset = curneeded; + curneeded += rec->dtrd_size; + ecb->dte_needed = MAX(ecb->dte_needed, curneeded); - if (prev != NULL) { - offs = prev->dta_rec.dtrd_offset + - prev->dta_rec.dtrd_size; - } else { - offs = sizeof (dtrace_epid_t); + aggbase = UINT32_MAX; + curneeded = UINT32_MAX; + } else if (act->dta_intuple) { + if (curneeded == UINT32_MAX) { + /* + * This is the first record in a tuple. Align + * curneeded to be at offset 4 in an 8-byte + * aligned block. + */ + ASSERT(act->dta_prev == NULL || !act->dta_prev->dta_intuple); + ASSERT(aggbase == UINT32_MAX); + + curneeded = P2PHASEUP(ecb->dte_size, + sizeof (uint64_t), sizeof (dtrace_aggid_t)); + + aggbase = curneeded - sizeof (dtrace_aggid_t); + ASSERT(IS_P2ALIGNED(aggbase, + sizeof (uint64_t))); } - wastuple = 0; - } else { - if (!act->dta_intuple) - ecb->dte_size = offs + rec->dtrd_size; - offs += rec->dtrd_size; + curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment); + rec->dtrd_offset = curneeded; + curneeded += rec->dtrd_size; + } else { + /* tuples must be followed by an aggregation */ + ASSERT(act->dta_prev == NULL || !act->dta_prev->dta_intuple); + ecb->dte_size = P2ROUNDUP(ecb->dte_size, rec->dtrd_alignment); + rec->dtrd_offset = ecb->dte_size; + ecb->dte_size += rec->dtrd_size; + ecb->dte_needed = MAX(ecb->dte_needed, ecb->dte_size); } - - wastuple = act->dta_intuple; } if ((act = ecb->dte_action) != NULL && !(act->dta_kind == DTRACEACT_SPECULATE && act->dta_next == NULL) && - ecb->dte_size == sizeof (dtrace_epid_t)) { + ecb->dte_size == sizeof (dtrace_rechdr_t)) { /* - * If the size is still sizeof (dtrace_epid_t), then all + * If the size is still sizeof (dtrace_rechdr_t), then all * actions store no data; set the size to 0. */ - ecb->dte_alignment = maxalign; ecb->dte_size = 0; - - /* - * If the needed space is still sizeof (dtrace_epid_t), then - * all actions need no additional space; set the needed - * size to 0. - */ - if (ecb->dte_needed == sizeof (dtrace_epid_t)) - ecb->dte_needed = 0; - - return; } - /* - * Set our alignment, and make sure that the dte_size and dte_needed - * are aligned to the size of an EPID. 
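The rewritten size and offset arithmetic in this function leans on the usual illumos power-of-two helpers, whose definitions are not part of this hunk. For readers unfamiliar with them, they behave roughly as follows (align must be a power of two); treat this as a sketch rather than the exact header text:

    #include <stdint.h>

    /* Round x up to the next multiple of align. */
    #define P2ROUNDUP(x, align)         (-(-(x) & -(align)))

    /*
     * Round x up to the next value congruent to phase modulo align.
     * P2PHASEUP(sz, sizeof (uint64_t), sizeof (dtrace_aggid_t)) is how the
     * new code lands tuple data at offset 4 within an 8-byte aligned block.
     */
    #define P2PHASEUP(x, align, phase)  ((phase) - (((phase) - (x)) & -(align)))

    /* True when v sits on an a-byte boundary. */
    #define IS_P2ALIGNED(v, a)          ((((uintptr_t)(v)) & ((uintptr_t)(a) - 1)) == 0)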
- */ - ecb->dte_alignment = maxalign; - ecb->dte_size = (ecb->dte_size + (sizeof (dtrace_epid_t) - 1)) & - ~(sizeof (dtrace_epid_t) - 1); - ecb->dte_needed = (ecb->dte_needed + (sizeof (dtrace_epid_t) - 1)) & - ~(sizeof (dtrace_epid_t) - 1); - ASSERT(ecb->dte_size <= ecb->dte_needed); + ecb->dte_size = P2ROUNDUP(ecb->dte_size, sizeof (dtrace_epid_t)); + ecb->dte_needed = P2ROUNDUP(ecb->dte_needed, (sizeof (dtrace_epid_t))); + ecb->dte_state->dts_needed = MAX(ecb->dte_state->dts_needed, ecb->dte_needed); } static dtrace_action_t * @@ -10147,7 +10174,7 @@ dtrace_ecb_action_add(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc) break; case DTRACEACT_SPECULATE: - if (ecb->dte_size > sizeof (dtrace_epid_t)) + if (ecb->dte_size > sizeof (dtrace_rechdr_t)) return (EINVAL); if (dp == NULL) @@ -10260,7 +10287,7 @@ dtrace_ecb_action_remove(dtrace_ecb_t *ecb) ecb->dte_action = NULL; ecb->dte_action_last = NULL; - ecb->dte_size = sizeof (dtrace_epid_t); + ecb->dte_size = 0; } static void @@ -10534,11 +10561,13 @@ dtrace_buffer_switch(dtrace_buffer_t *buf) caddr_t tomax = buf->dtb_tomax; caddr_t xamot = buf->dtb_xamot; dtrace_icookie_t cookie; + hrtime_t now; ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH)); ASSERT(!(buf->dtb_flags & DTRACEBUF_RING)); cookie = dtrace_interrupt_disable(); + now = dtrace_gethrtime(); buf->dtb_tomax = xamot; buf->dtb_xamot = tomax; buf->dtb_xamot_drops = buf->dtb_drops; @@ -10549,6 +10578,8 @@ dtrace_buffer_switch(dtrace_buffer_t *buf) buf->dtb_drops = 0; buf->dtb_errors = 0; buf->dtb_flags &= ~(DTRACEBUF_ERROR | DTRACEBUF_DROPPED); + buf->dtb_interval = now - buf->dtb_switched; + buf->dtb_switched = now; dtrace_interrupt_enable(cookie); } @@ -16617,6 +16648,7 @@ dtrace_ioctl(dev_t dev, u_long cmd, user_addr_t arg, int md, cred_t *cr, int *rv desc.dtbd_drops = buf->dtb_drops; desc.dtbd_errors = buf->dtb_errors; desc.dtbd_oldest = buf->dtb_xamot_offset; + desc.dtbd_timestamp = dtrace_gethrtime(); lck_mtx_unlock(&dtrace_lock); @@ -16669,6 +16701,7 @@ dtrace_ioctl(dev_t dev, u_long cmd, user_addr_t arg, int md, cred_t *cr, int *rv desc.dtbd_drops = buf->dtb_xamot_drops; desc.dtbd_errors = buf->dtb_xamot_errors; desc.dtbd_oldest = 0; + desc.dtbd_timestamp = buf->dtb_switched; lck_mtx_unlock(&dtrace_lock); diff --git a/bsd/hfs/hfs.h b/bsd/hfs/hfs.h index 0f3771a22..e3898a3a1 100644 --- a/bsd/hfs/hfs.h +++ b/bsd/hfs/hfs.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2014 Apple Inc. All rights reserved. + * Copyright (c) 2000-2015 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -106,6 +106,9 @@ extern struct timezone gTimeZone; /* How many free extents to cache per volume */ #define kMaxFreeExtents 10 +/* The maximum time hfs locks can be held while performing hfs statistics gathering */ +#define HFS_FSINFO_MAX_LOCKHELD_TIME 20 * 1000000ULL /* at most 20 milliseconds. 
*/ + /* * HFS_MINFREE gives the minimum acceptable percentage * of file system blocks which may be free (but this @@ -715,20 +718,6 @@ extern int hfs_prepare_release_storage (struct hfsmount *hfsmp, struct vnode *vp extern int hfs_bmap(struct vnode *, daddr_t, struct vnode **, daddr64_t *, unsigned int *); -extern int hfs_fsync(struct vnode *, int, int, struct proc *); - -extern int hfs_access(struct vnode *, mode_t, kauth_cred_t, struct proc *); - -extern int hfs_removeallattr(struct hfsmount *hfsmp, u_int32_t fileid); - -extern int hfs_set_volxattr(struct hfsmount *hfsmp, unsigned int xattrtype, int state); - -extern int hfs_isallocated(struct hfsmount *hfsmp, u_int32_t startingBlock, u_int32_t numBlocks); - -extern int hfs_count_allocated(struct hfsmount *hfsmp, u_int32_t startBlock, - u_int32_t numBlocks, u_int32_t *alloc_count); - -extern int hfs_isrbtree_active (struct hfsmount *hfsmp); extern errno_t hfs_ubc_setsize(vnode_t vp, off_t len, bool have_cnode_lock); @@ -904,6 +893,7 @@ extern int hfs_vgetrsrc(struct hfsmount *hfsmp, struct vnode *vp, extern int hfs_update(struct vnode *, int); +extern int hfs_fsync(struct vnode *, int, int, struct proc *); /***************************************************************************** Functions from hfs_xattr.c @@ -929,6 +919,9 @@ int hfs_getxattr_internal(cnode_t *, struct vnop_getxattr_args *, int hfs_xattr_write(vnode_t vp, const char *name, const void *data, size_t size); int hfs_setxattr_internal(struct cnode *, const void *, size_t, struct vnop_setxattr_args *, struct hfsmount *, u_int32_t); +extern int hfs_removeallattr(struct hfsmount *hfsmp, u_int32_t fileid); +extern int hfs_set_volxattr(struct hfsmount *hfsmp, unsigned int xattrtype, int state); + /***************************************************************************** @@ -951,6 +944,23 @@ extern cnid_t hfs_currentparent(cnode_t *cp); extern cnid_t hfs_currentcnid(cnode_t *cp); +/***************************************************************************** + Functions from VolumeAllocation.c + ******************************************************************************/ +extern int hfs_isallocated(struct hfsmount *hfsmp, u_int32_t startingBlock, + u_int32_t numBlocks); + +extern int hfs_count_allocated(struct hfsmount *hfsmp, u_int32_t startBlock, + u_int32_t numBlocks, u_int32_t *alloc_count); + +extern int hfs_isrbtree_active (struct hfsmount *hfsmp); + +/***************************************************************************** + Functions from hfs_fsinfo.c + ******************************************************************************/ +extern errno_t hfs_get_fsinfo(struct hfsmount *hfsmp, void *a_data); +extern void hfs_fsinfo_data_add(struct hfs_fsinfo_data *fsinfo, uint64_t entry); + #endif /* __APPLE_API_PRIVATE */ #endif /* KERNEL */ #endif /* __HFS__ */ diff --git a/bsd/hfs/hfs_cnode.c b/bsd/hfs/hfs_cnode.c index c2f92f02a..89589de28 100644 --- a/bsd/hfs/hfs_cnode.c +++ b/bsd/hfs/hfs_cnode.c @@ -2448,18 +2448,20 @@ hfs_unlock_truncate(struct cnode *cp, enum hfs_lockflags flags) vnode_t vp = NULL, rvp = NULL; /* - * Deal with any pending set sizes. We need to call - * ubc_setsize before we drop the exclusive lock. Ideally, - * hfs_unlock should be called before hfs_unlock_truncate but - * that's a lot to ask people to remember :-) + * If there are pending set sizes, the cnode lock should be dropped + * first. 
*/ +#if DEBUG + assert(!(cp->c_lockowner == thread + && ISSET(cp->c_flag, C_NEED_DATA_SETSIZE | C_NEED_RSRC_SETSIZE))); +#elif DEVELOPMENT if (cp->c_lockowner == thread && ISSET(cp->c_flag, C_NEED_DATA_SETSIZE | C_NEED_RSRC_SETSIZE)) { - // hfs_unlock will do the setsize calls for us - hfs_unlock(cp); - hfs_lock_always(cp, HFS_EXCLUSIVE_LOCK); + printf("hfs: hfs_unlock_truncate called with C_NEED_DATA/RSRC_SETSIZE set (caller: 0x%llx)\n", + (uint64_t)VM_KERNEL_UNSLIDE(__builtin_return_address(0))); } - +#endif + if (cp->c_need_dvnode_put_after_truncate_unlock) { vp = cp->c_vp; cp->c_need_dvnode_put_after_truncate_unlock = false; diff --git a/bsd/hfs/hfs_fsctl.h b/bsd/hfs/hfs_fsctl.h index f7f3c26b1..b90b722b5 100644 --- a/bsd/hfs/hfs_fsctl.h +++ b/bsd/hfs/hfs_fsctl.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2014 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2004-2015 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -54,6 +54,7 @@ struct hfs_journal_info { }; +// Will be deprecated and replaced by hfs_fsinfo struct hfsinfo_metadata { uint32_t total; uint32_t extents; @@ -64,6 +65,189 @@ struct hfsinfo_metadata { uint32_t reserved[4]; }; +/* + * Flags for hfs_fsinfo_data structure + */ +#define HFS_FSINFO_CLASS_A 0x0001 /* Information for class A files requested */ +#define HFS_FSINFO_CLASS_B 0x0002 /* Information for class B files requested */ +#define HFS_FSINFO_CLASS_C 0x0004 /* Information for class C files requested */ +#define HFS_FSINFO_CLASS_D 0x0008 /* Information for class D files requested */ + +/* + * Maximum number of buckets to represent range from 0 to 1TB (2^40) in + * increments of power of 2, and one catch-all bucket for anything that + * is greater than 1TB + */ +#define HFS_FSINFO_DATA_MAX_BUCKETS 42 + +/* + * Maximum number of buckets to represents percentage range from 0 to 100 + * in increments of 10. + */ +#define HFS_FSINFO_PERCENT_MAX_BUCKETS 10 + +/* + * Maximum number of buckets to represent number of file/directory name characters + * (range 1 to 255) in increments of 5. + */ +#define HFS_FSINFO_NAME_MAX_BUCKETS 51 + +/* + * Version number to ensure that the caller and the kernel have same understanding + * of the hfs_fsinfo_data structure. This version needs to be bumped whenever the + * number of buckets is changed. + */ +#define HFS_FSINFO_VERSION 1 + +/* + * hfs_fsinfo_data is generic data structure to aggregate information like sizes + * or counts in buckets of power of 2. Each bucket represents a range of values + * that is determined based on its index in the array. Specifically, buckets[i] + * represents values that are greater than or equal to 2^(i-1) and less than 2^i, + * except the last bucket which represents range greater than or equal to 2^(i-1) + * + * The current maximum number of buckets is 41, so we can represent range from + * 0 up to 1TB in increments of power of 2, and then a catch-all bucket of + * anything that is greater than or equal to 1TB. + * + * For example, + * bucket[0] -> greater than or equal to 0 and less than 1 + * bucket[1] -> greater than or equal to 1 and less than 2 + * bucket[10] -> greater than or equal to 2^(10-1) = 512 and less than 2^10 = 1024 + * bucket[20] -> greater than or equal to 2^(20-1) = 512KB and less than 2^20 = 1MB + * bucket[41] -> greater than or equal to 2^(41-1) = 1TB + * + * Note that fsctls that populate this data structure can take long time to + * execute as this operation can be I/O intensive (traversing btrees) and compute + * intensive. 
+ * + * WARNING: Any changes to this structure should also update version number to + * ensure that the clients and kernel are reading/writing correctly. + */ + +/* + * The header includes the user input fields. + */ +typedef struct hfs_fsinfo_header { + uint32_t request_type; + uint16_t version; + uint16_t flags; +} hfs_fsinfo_header_t; + +struct hfs_fsinfo_data { + hfs_fsinfo_header_t header; + uint32_t bucket[HFS_FSINFO_DATA_MAX_BUCKETS]; +}; + +/* + * Structure to represent information about metadata files + * + * WARNING: Any changes to this structure should also update version number to + * ensure that the clients and kernel are reading/writing correctly. + */ +struct hfs_fsinfo_metadata { + hfs_fsinfo_header_t header; + uint32_t extents; + uint32_t catalog; + uint32_t allocation; + uint32_t attribute; + uint32_t journal; +}; + +/* + * Structure to represent distribution of number of file name characters + * in increments of 5s. Each bucket represents a range of values that is + * determined based on its index in the array. So bucket[i] represents values + * that are greater than or equal to (i*5) and less than ((i+1)*10). + * + * Since this structure represents range of file name characters and the + * maximum number of unicode characters in HFS+ is 255, the maximum number + * of buckets will be 52 [0..51]. + * + * For example, + * bucket[4] -> greater than or equal to 20 and less than 25 characters + * bucket[51] -> equal to 255 characters + * + * WARNING: Any changes to this structure should also update version number to + * ensure that the clients and kernel are reading/writing correctly. + */ +struct hfs_fsinfo_name { + hfs_fsinfo_header_t header; + uint32_t bucket[HFS_FSINFO_NAME_MAX_BUCKETS]; +}; + +/* + * Structure to represent information about content protection classes + * + * WARNING: Any changes to this structure should also update version number to + * ensure that the clients and kernel are reading/writing correctly. 
+ */ +struct hfs_fsinfo_cprotect { + hfs_fsinfo_header_t header; + uint32_t class_A; + uint32_t class_B; + uint32_t class_C; + uint32_t class_D; + uint32_t class_E; + uint32_t class_F; +}; + +/* + * Union of all the different values returned by HFSIOC_FSINFO fsctl + */ +union hfs_fsinfo { + hfs_fsinfo_header_t header; + struct hfs_fsinfo_data data; + struct hfs_fsinfo_metadata metadata; + struct hfs_fsinfo_name name; + struct hfs_fsinfo_cprotect cprotect; +}; +typedef union hfs_fsinfo hfs_fsinfo; + +/* + * Type of FSINFO requested, specified by the caller in request_type field + */ +enum { + /* Information about number of allocation blocks for each metadata file, returns struct hfs_fsinfo_metadata */ + HFS_FSINFO_METADATA_BLOCKS_INFO = 1, + + /* Information about number of extents for each metadata file, returns struct hfs_fsinfo_metadata */ + HFS_FSINFO_METADATA_EXTENTS = 2, + + /* Information about percentage of free nodes vs used nodes in metadata btrees, returns struct hfs_fsinfo_metadata */ + HFS_FSINFO_METADATA_PERCENTFREE = 3, + + /* Distribution of number of extents for data files (data fork, no rsrc fork, no xattr), returns struct hfs_fsinfo_data */ + HFS_FSINFO_FILE_EXTENT_COUNT = 4, + + /* Distribution of extent sizes for data files (data fork, no rsrc fork, no xattr), returns struct hfs_fsinfo_data */ + HFS_FSINFO_FILE_EXTENT_SIZE = 5, + + /* Distribution of file sizes for data files (data fork, no rsrc fork, no xattr), returns struct hfs_fsinfo_data */ + HFS_FSINFO_FILE_SIZE = 6, + + /* Distribution of valence for all directories, returns struct hfs_fsinfo_data */ + HFS_FSINFO_DIR_VALENCE = 7, + + /* Distribution of file/directory name size in unicode characters, returns struct hfs_fsinfo_name */ + HFS_FSINFO_NAME_SIZE = 8, + + /* Distribution of extended attribute sizes, returns hfs_fsinfo_data */ + HFS_FSINFO_XATTR_SIZE = 9, + + /* Distribution of free space for the entire file system, returns struct hfs_fsinfo_data */ + HFS_FSINFO_FREE_EXTENTS = 10, + + /* Information about number of files belonging to each class, returns hfs_fsinfo_cprotect */ + HFS_FSINFO_FILE_CPROTECT_COUNT = 11, + + /* + * Distribution of symbolic link sizes for data files (data fork, no rsrc fork, no xattr), + * returns struct hfs_fsinfo_data + */ + HFS_FSINFO_SYMLINK_SIZE = 12, +}; + /* HFS FS CONTROL COMMANDS */ @@ -166,6 +350,8 @@ struct hfsinfo_metadata { /* + * XXX: Will be deprecated and replaced by HFSIOC_GET_FSINFO + * * Get information about number of file system allocation blocks used by metadata * files on the volume, including individual btrees and journal file. The caller * can determine the size of file system allocation block using value returned as @@ -178,6 +364,10 @@ struct hfsinfo_metadata { #define HFSIOC_CS_FREESPACE_TRIM _IOWR('h', 39, u_int32_t) #define HFS_CS_FREESPACE_TRIM IOCBASECMD(HFSIOC_CS_FREESPACE_TRIM) +/* Get file system information for the given volume */ +#define HFSIOC_GET_FSINFO _IOWR('h', 45, hfs_fsinfo) +#define HFS_GET_FSINFO IOCBASECMD(HFSIOC_GET_FSINFO) + #endif /* __APPLE_API_UNSTABLE */ #endif /* ! _HFS_FSCTL_H_ */ diff --git a/bsd/hfs/hfs_fsinfo.c b/bsd/hfs/hfs_fsinfo.c new file mode 100644 index 000000000..d3071086a --- /dev/null +++ b/bsd/hfs/hfs_fsinfo.c @@ -0,0 +1,891 @@ +/* + * Copyright (c) 2014-2015 Apple Inc. All rights reserved. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include + +#include "hfs.h" +#include "hfs_fsctl.h" +#include "hfs_endian.h" +#include "hfscommon/headers/BTreesInternal.h" +#include "hfscommon/headers/BTreesPrivate.h" +#include "hfscommon/headers/FileMgrInternal.h" + +#if CONFIG_PROTECT +#include +#endif + + +union HFSPlusRecord { + HFSPlusCatalogFolder folder_record; + HFSPlusCatalogFile file_record; + HFSPlusCatalogThread thread_record; + HFSPlusExtentRecord extent_record; + HFSPlusAttrRecord attr_record; +}; +typedef union HFSPlusRecord HFSPlusRecord; + +union HFSPlusKey { + HFSPlusExtentKey extent_key; + HFSPlusAttrKey attr_key; +}; +typedef union HFSPlusKey HFSPlusKey; + +typedef enum traverse_btree_flag { + + //If set, extents btree will also be traversed along with catalog btree, so grab correct locks upfront + TRAVERSE_BTREE_EXTENTS = 1, + + // Getting content-protection attributes, allocate enough space to accomodate the records. 
+ TRAVERSE_BTREE_XATTR_CPROTECT = 2, + +} traverse_btree_flag_t; + + + +static errno_t hfs_fsinfo_metadata_blocks(struct hfsmount *hfsmp, struct hfs_fsinfo_metadata *fsinfo); +static errno_t hfs_fsinfo_metadata_extents(struct hfsmount *hfsmp, struct hfs_fsinfo_metadata *fsinfo); +static errno_t hfs_fsinfo_metadata_percentfree(struct hfsmount *hfsmp, struct hfs_fsinfo_metadata *fsinfo); +static errno_t fsinfo_file_extent_count_callback(struct hfsmount *hfsmp, HFSPlusKey *key, HFSPlusRecord *record, void *data); +static errno_t fsinfo_file_extent_size_catalog_callback(struct hfsmount *hfsmp, HFSPlusKey *key, HFSPlusRecord *record, void *data); +static errno_t fsinfo_file_extent_size_overflow_callback(struct hfsmount *hfsmp, HFSPlusKey *key, HFSPlusRecord *record, void *data); +static errno_t fsinfo_file_size_callback(struct hfsmount *hfsmp, HFSPlusKey *key, HFSPlusRecord *record, void *data); +static errno_t fsinfo_dir_valence_callback(struct hfsmount *hfsmp, HFSPlusKey *key, HFSPlusRecord *record, void *data); +static errno_t fsinfo_name_size_callback(struct hfsmount *hfsmp, HFSPlusKey *key, HFSPlusRecord *record, void *data); +static errno_t fsinfo_xattr_size_callback(struct hfsmount *hfsmp, HFSPlusKey *key, HFSPlusRecord *record, void *data); +static errno_t traverse_btree(struct hfsmount *hfsmp, uint32_t btree_fileID, traverse_btree_flag_t flags, void *fsinfo, + int (*callback)(struct hfsmount *, HFSPlusKey *, HFSPlusRecord *, void *)); +static errno_t hfs_fsinfo_free_extents(struct hfsmount *hfsmp, struct hfs_fsinfo_data *fsinfo); +static void fsinfo_free_extents_callback(void *data, off_t free_extent_size); +#if CONFIG_PROTECT +static errno_t fsinfo_cprotect_count_callback(struct hfsmount *hfsmp, HFSPlusKey *key, HFSPlusRecord *record, void *data); +#endif +static errno_t fsinfo_symlink_size_callback(struct hfsmount *hfsmp, HFSPlusKey *key, HFSPlusRecord *record, void *data); + +/* + * Entry function for all the fsinfo requests from hfs_vnop_ioctl() + * Depending on the type of request, this function will call the + * appropriate sub-function and return success or failure back to + * the caller. + */ +__private_extern__ +errno_t hfs_get_fsinfo(struct hfsmount *hfsmp, void *a_data) +{ + int error = 0; + hfs_fsinfo *fsinfo_union; + uint32_t request_type; + uint32_t header_len = sizeof(hfs_fsinfo_header_t); + + fsinfo_union = (hfs_fsinfo *)a_data; + request_type = fsinfo_union->header.request_type; + + // Zero out output fields to fsinfo_union, keep the user input fields intact. 
+ bzero((char *)fsinfo_union + header_len, sizeof(hfs_fsinfo) - header_len); + + switch (request_type) { + case HFS_FSINFO_METADATA_BLOCKS_INFO: + error = hfs_fsinfo_metadata_blocks(hfsmp, &(fsinfo_union->metadata)); + break; + + case HFS_FSINFO_METADATA_EXTENTS: + error = hfs_fsinfo_metadata_extents(hfsmp, &(fsinfo_union->metadata)); + break; + + case HFS_FSINFO_METADATA_PERCENTFREE: + error = hfs_fsinfo_metadata_percentfree(hfsmp, &(fsinfo_union->metadata)); + break; + + case HFS_FSINFO_FILE_EXTENT_COUNT: + /* Traverse catalog btree and invoke callback for all records */ + error = traverse_btree(hfsmp, kHFSCatalogFileID, TRAVERSE_BTREE_EXTENTS, &(fsinfo_union->data), fsinfo_file_extent_count_callback); + break; + + case HFS_FSINFO_FILE_EXTENT_SIZE: + /* Traverse the catalog btree first */ + error = traverse_btree(hfsmp, kHFSCatalogFileID, 0, &(fsinfo_union->data), &fsinfo_file_extent_size_catalog_callback); + if (error) { + break; + } + /* Traverse the overflow extents btree now */ + error = traverse_btree(hfsmp, kHFSExtentsFileID, 0, &(fsinfo_union->data), &fsinfo_file_extent_size_overflow_callback); + break; + + case HFS_FSINFO_FILE_SIZE: + /* Traverse catalog btree and invoke callback for all records */ + error = traverse_btree(hfsmp, kHFSCatalogFileID, 0, &(fsinfo_union->data), &fsinfo_file_size_callback); + break; + + case HFS_FSINFO_DIR_VALENCE: + /* Traverse catalog btree and invoke callback for all records */ + error = traverse_btree(hfsmp, kHFSCatalogFileID, 0, &(fsinfo_union->data), &fsinfo_dir_valence_callback); + break; + + case HFS_FSINFO_NAME_SIZE: + /* Traverse catalog btree and invoke callback for all records */ + error = traverse_btree(hfsmp, kHFSCatalogFileID, 0, &(fsinfo_union->name), &fsinfo_name_size_callback); + break; + + case HFS_FSINFO_XATTR_SIZE: + /* Traverse attribute btree and invoke callback for all records */ + error = traverse_btree(hfsmp, kHFSAttributesFileID, 0, &(fsinfo_union->data), &fsinfo_xattr_size_callback); + break; + + case HFS_FSINFO_FREE_EXTENTS: + error = hfs_fsinfo_free_extents(hfsmp, &(fsinfo_union->data)); + break; + + case HFS_FSINFO_SYMLINK_SIZE: + /* Traverse catalog btree and invoke callback for all records */ + error = traverse_btree(hfsmp, kHFSCatalogFileID, 0, &(fsinfo_union->data), &fsinfo_symlink_size_callback); + break; + +#if CONFIG_PROTECT + case HFS_FSINFO_FILE_CPROTECT_COUNT: + /* Traverse attribute btree and invoke callback for all records */ + error = traverse_btree(hfsmp, kHFSAttributesFileID, TRAVERSE_BTREE_XATTR_CPROTECT, &(fsinfo_union->cprotect), &fsinfo_cprotect_count_callback); + break; +#endif + + default: + return ENOTSUP; + }; + + return error; +} + +/* + * This function provides information about total number of allocation blocks + * for each individual metadata file. 
+ */ +static errno_t +hfs_fsinfo_metadata_blocks(struct hfsmount *hfsmp, struct hfs_fsinfo_metadata *fsinfo) +{ + int lockflags = 0; + int ret_lockflags = 0; + + /* + * Getting number of allocation blocks for all metadata files + * should be a relatively quick operation, so we grab locks for all + * the btrees at the same time + */ + lockflags = SFL_CATALOG | SFL_EXTENTS | SFL_BITMAP | SFL_ATTRIBUTE; + ret_lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_SHARED_LOCK); + + /* Get information about all the btrees */ + fsinfo->extents = hfsmp->hfs_extents_cp->c_datafork->ff_blocks; + fsinfo->catalog = hfsmp->hfs_catalog_cp->c_datafork->ff_blocks; + fsinfo->allocation = hfsmp->hfs_allocation_cp->c_datafork->ff_blocks; + if (hfsmp->hfs_attribute_cp) + fsinfo->attribute = hfsmp->hfs_attribute_cp->c_datafork->ff_blocks; + else + fsinfo->attribute = 0; + + /* Done with btrees, give up the locks */ + hfs_systemfile_unlock(hfsmp, ret_lockflags); + + /* Get information about journal file */ + fsinfo->journal = howmany(hfsmp->jnl_size, hfsmp->blockSize); + + return 0; +} + +/* + * Helper function to count the number of valid extents in a file fork structure + */ +static uint32_t +hfs_count_extents_fp(struct filefork *ff) +{ + int i; + uint32_t count = 0; + for (i = 0; i < kHFSPlusExtentDensity; i++) { + if (ff->ff_data.cf_extents[i].blockCount == 0) { + break; + } + count++; + } + return count; +} + + +/* + * This is a helper function that counts the total number of valid + * extents in all the overflow extent records for given fileID + * in overflow extents btree + */ +static errno_t +hfs_count_overflow_extents(struct hfsmount *hfsmp, uint32_t fileID, uint32_t *num_extents) +{ + int error; + FCB *fcb; + struct BTreeIterator *iterator = NULL; + FSBufferDescriptor btdata; + HFSPlusExtentKey *extentKey; + HFSPlusExtentRecord extentData; + uint32_t extent_count = 0; + int i; + + fcb = VTOF(hfsmp->hfs_extents_vp); + MALLOC(iterator, struct BTreeIterator *, sizeof(struct BTreeIterator), M_TEMP, M_WAITOK | M_ZERO); + + extentKey = (HFSPlusExtentKey *) &iterator->key; + extentKey->keyLength = kHFSPlusExtentKeyMaximumLength; + extentKey->forkType = kHFSDataForkType; + extentKey->fileID = fileID; + extentKey->startBlock = 0; + + btdata.bufferAddress = &extentData; + btdata.itemSize = sizeof(HFSPlusExtentRecord); + btdata.itemCount = 1; + + /* Search for overflow extent record */ + error = BTSearchRecord(fcb, iterator, &btdata, NULL, iterator); + + /* + * We used startBlock of zero, so we will not find any records and errors + * are expected. It will also position the iterator just before the first + * overflow extent record for given fileID (if any). + */ + if (error && error != fsBTRecordNotFoundErr && error != fsBTEndOfIterationErr) + goto out; + error = 0; + + for (;;) { + + if (msleep(NULL, NULL, PINOD | PCATCH, + "hfs_fsinfo", NULL) == EINTR) { + error = EINTR; + break; + } + + error = BTIterateRecord(fcb, kBTreeNextRecord, iterator, &btdata, NULL); + if (error != 0) { + /* These are expected errors, so mask them */ + if (error == fsBTRecordNotFoundErr || error == fsBTEndOfIterationErr) { + error = 0; + } + break; + } + + /* If we encounter different fileID, stop the iteration */ + if (extentKey->fileID != fileID) { + break; + } + + if (extentKey->forkType != kHFSDataForkType) + break; + + /* This is our record of interest; only count the datafork extents. 
*/ + for (i = 0; i < kHFSPlusExtentDensity; i++) { + if (extentData[i].blockCount == 0) { + break; + } + extent_count++; + } + } + +out: + FREE(iterator, M_TEMP); + + if (error == 0) { + *num_extents = extent_count; + } + return MacToVFSError(error); +} + +/* + * This function provides information about total number of extents (including + * extents from overflow extents btree, if any) for each individual metadata + * file. + */ +static errno_t +hfs_fsinfo_metadata_extents(struct hfsmount *hfsmp, struct hfs_fsinfo_metadata *fsinfo) +{ + int error = 0; + int lockflags = 0; + int ret_lockflags = 0; + uint32_t overflow_count; + + /* + * Counting the number of extents for all metadata files should + * be a relatively quick operation, so we grab locks for all the + * btrees at the same time + */ + lockflags = SFL_CATALOG | SFL_EXTENTS | SFL_BITMAP | SFL_ATTRIBUTE; + ret_lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_SHARED_LOCK); + + /* Get number of extents for extents overflow btree */ + fsinfo->extents = hfs_count_extents_fp(hfsmp->hfs_extents_cp->c_datafork); + + /* Get number of extents for catalog btree */ + fsinfo->catalog = hfs_count_extents_fp(hfsmp->hfs_catalog_cp->c_datafork); + if (fsinfo->catalog >= kHFSPlusExtentDensity) { + error = hfs_count_overflow_extents(hfsmp, kHFSCatalogFileID, &overflow_count); + if (error) { + goto out; + } + fsinfo->catalog += overflow_count; + } + + /* Get number of extents for allocation file */ + fsinfo->allocation = hfs_count_extents_fp(hfsmp->hfs_allocation_cp->c_datafork); + if (fsinfo->allocation >= kHFSPlusExtentDensity) { + error = hfs_count_overflow_extents(hfsmp, kHFSAllocationFileID, &overflow_count); + if (error) { + goto out; + } + fsinfo->allocation += overflow_count; + } + + /* + * Get number of extents for attribute btree. + * hfs_attribute_cp might be NULL. + */ + if (hfsmp->hfs_attribute_cp) { + fsinfo->attribute = hfs_count_extents_fp(hfsmp->hfs_attribute_cp->c_datafork); + if (fsinfo->attribute >= kHFSPlusExtentDensity) { + error = hfs_count_overflow_extents(hfsmp, kHFSAttributesFileID, &overflow_count); + if (error) { + goto out; + } + fsinfo->attribute += overflow_count; + } + } + /* Journal always has one extent */ + fsinfo->journal = 1; +out: + hfs_systemfile_unlock(hfsmp, ret_lockflags); + return error; +} + +/* + * Helper function to calculate percentage i.e. X is what percent of Y? + */ +static inline uint32_t +hfs_percent(uint32_t X, uint32_t Y) +{ + return (X * 100ll) / Y; +} + +/* + * This function provides percentage of free nodes vs total nodes for each + * individual metadata btrees, i.e. for catalog, overflow extents and + * attributes btree. This information is not applicable for allocation + * file and journal file. 
+ */ +static errno_t +hfs_fsinfo_metadata_percentfree(struct hfsmount *hfsmp, struct hfs_fsinfo_metadata *fsinfo) +{ + int lockflags = 0; + int ret_lockflags = 0; + BTreeControlBlockPtr btreePtr; + uint32_t free_nodes, total_nodes; + + /* + * Getting total and used nodes for all metadata btrees should + * be a relatively quick operation, so we grab locks for all the + * btrees at the same time + */ + lockflags = SFL_CATALOG | SFL_EXTENTS | SFL_BITMAP | SFL_ATTRIBUTE; + ret_lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_SHARED_LOCK); + + /* Overflow extents btree */ + btreePtr = VTOF(hfsmp->hfs_extents_vp)->fcbBTCBPtr; + total_nodes = btreePtr->totalNodes; + free_nodes = btreePtr->freeNodes; + fsinfo->extents = hfs_percent(free_nodes, total_nodes); + + /* Catalog btree */ + btreePtr = VTOF(hfsmp->hfs_catalog_vp)->fcbBTCBPtr; + total_nodes = btreePtr->totalNodes; + free_nodes = btreePtr->freeNodes; + fsinfo->catalog = hfs_percent(free_nodes, total_nodes); + + /* Attributes btree */ + if (hfsmp->hfs_attribute_vp) { + btreePtr = VTOF(hfsmp->hfs_attribute_vp)->fcbBTCBPtr; + total_nodes = btreePtr->totalNodes; + free_nodes = btreePtr->freeNodes; + fsinfo->attribute = hfs_percent(free_nodes, total_nodes); + } + + hfs_systemfile_unlock(hfsmp, ret_lockflags); + return 0; +} + +/* + * Helper function to calculate log base 2 for given number + */ +static inline int +hfs_log2(uint64_t entry) +{ + return (63 - __builtin_clzll(entry|1)); +} + +/* + * Helper function to account for input entry into the data + * array based on its log base 2 value + */ +__private_extern__ +void hfs_fsinfo_data_add(struct hfs_fsinfo_data *fsinfo, uint64_t entry) +{ + /* + * From hfs_fsctl.h - + * + * hfs_fsinfo_data is generic data structure to aggregate information like sizes + * or counts in buckets of power of 2. Each bucket represents a range of values + * that is determined based on its index in the array. Specifically, buckets[i] + * represents values that are greater than or equal to 2^(i-1) and less than 2^i, + * except the last bucket which represents range greater than or equal to 2^(i-1) + * + * The current maximum number of buckets is 41, so we can represent range from + * 0 up to 1TB in increments of power of 2, and then a catch-all bucket of + * anything that is greater than or equal to 1TB. + * + * For example, + * bucket[0] -> greater than or equal to 0 and less than 1 + * bucket[1] -> greater than or equal to 1 and less than 2 + * bucket[10] -> greater than or equal to 2^(10-1) = 512 and less than 2^10 = 1024 + * bucket[20] -> greater than or equal to 2^(20-1) = 512KB and less than 2^20 = 1MB + * bucket[41] -> greater than or equal to 2^(41-1) = 1TB + */ + uint32_t bucket; + + if (entry) { + /* + * Calculate log base 2 value for the entry. + * Account for this value in the appropriate bucket. + * The last bucket is a catch-all bucket of + * anything that is greater than or equal to 1TB + */ + bucket = MIN(hfs_log2(entry) + 1, HFS_FSINFO_DATA_MAX_BUCKETS-1); + ++fsinfo->bucket[bucket]; + } else { + /* Entry is zero, so account it in 0th offset */ + fsinfo->bucket[0]++; + } +} + +/* + * Function to traverse all the records of a btree and then call caller-provided + * callback function for every record found. The type of btree is chosen based + * on the fileID provided by the caller. This fuction grabs the correct locks + * depending on the type of btree it will be traversing and flags provided + * by the caller. + * + * Note: It might drop and reacquire the locks during execution. 
+ */ +static errno_t +traverse_btree(struct hfsmount *hfsmp, uint32_t btree_fileID, traverse_btree_flag_t flags, + void *fsinfo, int (*callback)(struct hfsmount *, HFSPlusKey *, HFSPlusRecord *, void *)) +{ + int error = 0; + int lockflags = 0; + int ret_lockflags = 0; + FCB *fcb; + struct BTreeIterator *iterator = NULL; + struct FSBufferDescriptor btdata; + int btree_operation; + HFSPlusRecord record; + HFSPlusKey *key; + uint64_t start, timeout_abs; + + switch(btree_fileID) { + case kHFSExtentsFileID: + fcb = VTOF(hfsmp->hfs_extents_vp); + lockflags = SFL_EXTENTS; + break; + case kHFSCatalogFileID: + fcb = VTOF(hfsmp->hfs_catalog_vp); + lockflags = SFL_CATALOG; + break; + case kHFSAttributesFileID: + // Attributes file doesn’t exist, There are no records to iterate. + if (hfsmp->hfs_attribute_vp == NULL) + return error; + fcb = VTOF(hfsmp->hfs_attribute_vp); + lockflags = SFL_ATTRIBUTE; + break; + + default: + return EINVAL; + } + + MALLOC(iterator, struct BTreeIterator *, sizeof(struct BTreeIterator), M_TEMP, M_WAITOK | M_ZERO); + + /* The key is initialized to zero because we are traversing entire btree */ + key = (HFSPlusKey *)&iterator->key; + + if (flags & TRAVERSE_BTREE_EXTENTS) { + lockflags |= SFL_EXTENTS; + } + + btdata.bufferAddress = &record; + btdata.itemSize = sizeof(HFSPlusRecord); + btdata.itemCount = 1; + + /* Lock btree for duration of traversal */ + ret_lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_SHARED_LOCK); + btree_operation = kBTreeFirstRecord; + + nanoseconds_to_absolutetime(HFS_FSINFO_MAX_LOCKHELD_TIME, &timeout_abs); + start = mach_absolute_time(); + + while (1) { + + if (msleep(NULL, NULL, PINOD | PCATCH, + "hfs_fsinfo", NULL) == EINTR) { + error = EINTR; + break; + } + + error = BTIterateRecord(fcb, btree_operation, iterator, &btdata, NULL); + if (error != 0) { + if (error == fsBTRecordNotFoundErr || error == fsBTEndOfIterationErr) { + error = 0; + } + break; + } + /* Lookup next btree record on next call to BTIterateRecord() */ + btree_operation = kBTreeNextRecord; + + /* Call our callback function and stop iteration if there are any errors */ + error = callback(hfsmp, key, &record, fsinfo); + if (error) { + break; + } + + /* let someone else use the tree after we've processed over HFS_FSINFO_MAX_LOCKHELD_TIME */ + if ((mach_absolute_time() - start) >= timeout_abs) { + + /* release b-tree locks and let someone else get the lock */ + hfs_systemfile_unlock (hfsmp, ret_lockflags); + + /* add tsleep here to force context switch and fairness */ + tsleep((caddr_t)hfsmp, PRIBIO, "hfs_fsinfo", 1); + + /* + * re-acquire the locks in the same way that we wanted them originally. + * note: it is subtle but worth pointing out that in between the time that we + * released and now want to re-acquire these locks that the b-trees may have shifted + * slightly but significantly. For example, the catalog or other b-tree could have grown + * past 8 extents and now requires the extents lock to be held in order to be safely + * manipulated. We can't be sure of the state of the b-tree from where we last left off. + */ + + ret_lockflags = hfs_systemfile_lock (hfsmp, lockflags, HFS_SHARED_LOCK); + + /* + * It's highly likely that the search key we stashed away before dropping lock + * no longer points to an existing item. Iterator's IterateRecord is able to + * re-position itself and process the next record correctly. With lock dropped, + * there might be records missed for statistic gathering, which is ok. The + * point is to get aggregate values. 
+ */ + + start = mach_absolute_time(); + + /* loop back around and get another record */ + } + } + + hfs_systemfile_unlock(hfsmp, ret_lockflags); + FREE (iterator, M_TEMP); + return MacToVFSError(error); +} + +/* + * Callback function to get distribution of number of extents + * for all user files in given file system. Note that this only + * accounts for data fork, no resource fork. + */ +static errno_t +fsinfo_file_extent_count_callback(struct hfsmount *hfsmp, + __unused HFSPlusKey *key, HFSPlusRecord *record, void *data) +{ + int i; + int error = 0; + uint32_t num_extents = 0; + uint32_t num_overflow = 0; + uint32_t blockCount; + + if (record->file_record.recordType == kHFSPlusFileRecord) { + /* Count total number of extents for this file */ + for (i = 0; i < kHFSPlusExtentDensity; i++) { + blockCount = record->file_record.dataFork.extents[i].blockCount; + if (blockCount == 0) { + break; + } + num_extents++; + } + /* This file has overflow extent records, so search overflow btree */ + if (num_extents >= kHFSPlusExtentDensity) { + /* The caller also hold extents overflow btree lock */ + error = hfs_count_overflow_extents(hfsmp, record->file_record.fileID, &num_overflow); + if (error) { + goto out; + } + num_extents += num_overflow; + } + hfs_fsinfo_data_add(data, num_extents); + } +out: + return error; +} + +/* + * Callback function to get distribution of individual extent sizes + * (in bytes) for all user files in given file system from catalog + * btree only. Note that this only accounts for data fork, no resource + * fork. + */ +static errno_t fsinfo_file_extent_size_catalog_callback(__unused struct hfsmount *hfsmp, + __unused HFSPlusKey *key, HFSPlusRecord *record, void *data) +{ + int i; + uint32_t blockCount; + uint64_t extent_size; + + if (record->file_record.recordType == kHFSPlusFileRecord) { + /* Traverse through all valid extents */ + for (i = 0; i < kHFSPlusExtentDensity; i++) { + blockCount = record->file_record.dataFork.extents[i].blockCount; + if (blockCount == 0) { + break; + } + extent_size = hfs_blk_to_bytes(blockCount, hfsmp->blockSize); + hfs_fsinfo_data_add(data, extent_size); + } + } + return 0; +} + +/* + * Callback function to get distribution of individual extent sizes + * (in bytes) for all user files in given file system from overflow + * extents btree only. Note that this only accounts for data fork, + * no resource fork. + */ +static errno_t fsinfo_file_extent_size_overflow_callback(__unused struct hfsmount *hfsmp, + HFSPlusKey *key, HFSPlusRecord *record, void *data) +{ + int i; + uint32_t blockCount; + uint64_t extent_size; + + if (key->extent_key.fileID >= kHFSFirstUserCatalogNodeID) { + // Only count the data fork extents. + if (key->extent_key.forkType == kHFSDataForkType) { + for (i = 0; i < kHFSPlusExtentDensity; i++) { + blockCount = record->extent_record[i].blockCount; + if (blockCount == 0) { + break; + } + extent_size = hfs_blk_to_bytes(blockCount, hfsmp->blockSize); + hfs_fsinfo_data_add(data, extent_size); + } + } + } + return 0; +} + +/* + * Callback function to get distribution of file sizes (in bytes) + * for all user files in given file system. Note that this only + * accounts for data fork, no resource fork. 
+ */ +static errno_t fsinfo_file_size_callback(__unused struct hfsmount *hfsmp, + __unused HFSPlusKey *key, HFSPlusRecord *record, void *data) +{ + if (record->file_record.recordType == kHFSPlusFileRecord) { + /* Record of interest, account for the size in the bucket */ + hfs_fsinfo_data_add(data, record->file_record.dataFork.logicalSize); + } + return 0; +} + +/* + * Callback function to get distribution of directory valence + * for all directories in the given file system. + */ +static errno_t fsinfo_dir_valence_callback(__unused struct hfsmount *hfsmp, + __unused HFSPlusKey *key, HFSPlusRecord *record, void *data) +{ + if (record->folder_record.recordType == kHFSPlusFolderRecord) { + hfs_fsinfo_data_add(data, record->folder_record.valence); + } + return 0; +} + +/* + * Callback function to get distribution of number of unicode + * characters in name for all files and directories for a given + * file system. + */ +static errno_t fsinfo_name_size_callback(__unused struct hfsmount *hfsmp, + __unused HFSPlusKey *key, HFSPlusRecord *record, void *data) +{ + struct hfs_fsinfo_name *fsinfo = (struct hfs_fsinfo_name *)data; + uint32_t length; + + if ((record->folder_record.recordType == kHFSPlusFolderThreadRecord) || + (record->folder_record.recordType == kHFSPlusFileThreadRecord)) { + length = record->thread_record.nodeName.length; + /* Make sure that the nodeName is bounded, otherwise return error */ + if (length > kHFSPlusMaxFileNameChars) { + return EIO; + } + + // sanity check for a name length of zero, which isn't valid on disk. + if (length == 0) + return EIO; + + /* Round it down to nearest multiple of 5 to match our buckets granularity */ + length = (length - 1)/ 5; + /* Account this value into our bucket */ + fsinfo->bucket[length]++; + } + return 0; +} + +/* + * Callback function to get distribution of size of all extended + * attributes for a given file system. + */ +static errno_t fsinfo_xattr_size_callback(__unused struct hfsmount *hfsmp, + __unused HFSPlusKey *key, HFSPlusRecord *record, void *data) +{ + if (record->attr_record.recordType == kHFSPlusAttrInlineData) { + /* Inline attribute */ + hfs_fsinfo_data_add(data, record->attr_record.attrData.attrSize); + } else if (record->attr_record.recordType == kHFSPlusAttrForkData) { + /* Larger attributes with extents information */ + hfs_fsinfo_data_add(data, record->attr_record.forkData.theFork.logicalSize); + } + return 0; +} + + +/* + * Callback function to get distribution of free space extents for a given file system. + */ +static void fsinfo_free_extents_callback(void *data, off_t free_extent_size) +{ + // Assume a minimum of 4 KB block size + hfs_fsinfo_data_add(data, free_extent_size / 4096); +} + +/* + * Function to get distribution of free space extents for a given file system. + */ +static errno_t hfs_fsinfo_free_extents(struct hfsmount *hfsmp, struct hfs_fsinfo_data *fsinfo) +{ + return hfs_find_free_extents(hfsmp, &fsinfo_free_extents_callback, fsinfo); +} + +/* + * Callback function to get distribution of symblock link sizes (in bytes) + * for all user files in given file system. Note that this only + * accounts for data fork, no resource fork. 
+ */ +static errno_t fsinfo_symlink_size_callback(__unused struct hfsmount *hfsmp, + __unused HFSPlusKey *key, HFSPlusRecord *record, void *data) +{ + if (record->file_record.recordType == kHFSPlusFileRecord) { + /* Record of interest, account for the size in the bucket */ + if (S_ISLNK(record->file_record.bsdInfo.fileMode)) + hfs_fsinfo_data_add((struct hfs_fsinfo_data *)data, record->file_record.dataFork.logicalSize); + } + return 0; +} + +#if CONFIG_PROTECT +/* + * Callback function to get total number of files/directories + * for each content protection class + */ +static int fsinfo_cprotect_count_callback(struct hfsmount *hfsmp, HFSPlusKey *key, + HFSPlusRecord *record, void *data) +{ + struct hfs_fsinfo_cprotect *fsinfo = (struct hfs_fsinfo_cprotect *)data; + static const uint16_t cp_xattrname_utf16[] = CONTENT_PROTECTION_XATTR_NAME_CHARS; + static const size_t cp_xattrname_utf16_len = sizeof(cp_xattrname_utf16)/2; + struct cp_xattr_v5 *xattr; + size_t xattr_len = sizeof(struct cp_xattr_v5); + struct cprotect cp_entry; + struct cprotect *cp_entryp = &cp_entry; + int error = 0; + + /* Content protect xattrs are inline attributes only, so skip all others */ + if (record->attr_record.recordType != kHFSPlusAttrInlineData) + return 0; + + /* We only look at content protection xattrs */ + if ((key->attr_key.attrNameLen != cp_xattrname_utf16_len) || + (bcmp(key->attr_key.attrName, cp_xattrname_utf16, cp_xattrname_utf16_len))) { + return 0; + } + + xattr = (struct cp_xattr_v5 *)((void *)(record->attr_record.attrData.attrData)); + error = cp_read_xattr_v5(hfsmp, xattr, xattr_len, (cprotect_t *)&cp_entryp, + CP_GET_XATTR_BASIC_INFO); + if (error) + return 0; + + /* No key present, skip this record */ + if (!ISSET(cp_entry.cp_flags, CP_HAS_A_KEY)) + return 0; + + /* Now account for the persistent class */ + switch (CP_CLASS(cp_entry.cp_pclass)) { + case PROTECTION_CLASS_A: + fsinfo->class_A++; + break; + case PROTECTION_CLASS_B: + fsinfo->class_B++; + break; + case PROTECTION_CLASS_C: + fsinfo->class_C++; + break; + case PROTECTION_CLASS_D: + fsinfo->class_D++; + break; + case PROTECTION_CLASS_E: + fsinfo->class_E++; + break; + case PROTECTION_CLASS_F: + fsinfo->class_F++; + break; + }; + + return 0; +} +#endif diff --git a/bsd/hfs/hfs_readwrite.c b/bsd/hfs/hfs_readwrite.c index 96e8c20ed..f09bdc7d2 100644 --- a/bsd/hfs/hfs_readwrite.c +++ b/bsd/hfs/hfs_readwrite.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2014 Apple Inc. All rights reserved. + * Copyright (c) 2000-2015 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -1578,7 +1578,7 @@ do_bulk_access_check(struct hfsmount *hfsmp, struct vnode *vp, int hfs_vnop_ioctl( struct vnop_ioctl_args /* { vnode_t a_vp; - int a_command; + long a_command; caddr_t a_data; int a_fflag; vfs_context_t a_context; @@ -2654,6 +2654,37 @@ fail_change_next_allocation: break; } + case HFS_GET_FSINFO: { + hfs_fsinfo *fsinfo = (hfs_fsinfo *)ap->a_data; + + /* Only root is allowed to get fsinfo */ + if (!kauth_cred_issuser(kauth_cred_get())) { + return EACCES; + } + + /* + * Make sure that the caller's version number matches with + * the kernel's version number. This will make sure that + * if the structures being read/written into are changed + * by the kernel, the caller will not read incorrect data. + * + * The first three fields --- request_type, version and + * flags are same for all the hfs_fsinfo structures, so + * we can access the version number by assuming any + * structure for now. 
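To make the new interface concrete, here is a hypothetical userspace sketch (not part of this commit) that requests the file-size distribution for the root volume. It assumes the hfs_fsctl.h definitions above are visible to userspace and that fsctl(2) is declared in <sys/fsctl.h>; as the handler below shows, the caller must be root and must set header.version to HFS_FSINFO_VERSION. The full HFSIOC_GET_FSINFO command is passed so the copy-in/copy-out size is known, while the kernel matches on its IOCBASECMD form, HFS_GET_FSINFO:

    #include <stdio.h>
    #include <string.h>
    #include <sys/types.h>
    #include <sys/fsctl.h>
    #include <hfs/hfs_fsctl.h>

    int main(void)
    {
        hfs_fsinfo fsinfo;

        memset(&fsinfo, 0, sizeof(fsinfo));
        fsinfo.header.request_type = HFS_FSINFO_FILE_SIZE;   /* distribution of data-fork file sizes */
        fsinfo.header.version = HFS_FSINFO_VERSION;

        if (fsctl("/", HFSIOC_GET_FSINFO, &fsinfo, 0) != 0) {
            perror("fsctl(HFSIOC_GET_FSINFO)");
            return 1;
        }

        /* bucket[i] counts files whose size falls in [2^(i-1), 2^i) bytes. */
        for (int i = 0; i < HFS_FSINFO_DATA_MAX_BUCKETS; i++)
            printf("bucket[%d] = %u\n", i, fsinfo.data.bucket[i]);

        return 0;
    }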
+ */ + if (fsinfo->header.version != HFS_FSINFO_VERSION) { + return ENOTSUP; + } + + /* Make sure that the current file system is not marked inconsistent */ + if (hfsmp->vcbAtrb & kHFSVolumeInconsistentMask) { + return EIO; + } + + return hfs_get_fsinfo(hfsmp, ap->a_data); + } + case HFS_CS_FREESPACE_TRIM: { int error = 0; int lockflags = 0; diff --git a/bsd/hfs/hfs_vnops.c b/bsd/hfs/hfs_vnops.c index 5bfc09c3e..0c327a792 100644 --- a/bsd/hfs/hfs_vnops.c +++ b/bsd/hfs/hfs_vnops.c @@ -3655,8 +3655,8 @@ relock: * truncate lock) */ rm_done: - hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); hfs_unlockpair(dcp, cp); + hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); if (recycle_rsrc) { /* inactive or reclaim on rvp will clean up the blocks from the rsrc fork */ @@ -5224,12 +5224,12 @@ out: wakeup((caddr_t)&tdcp->c_flag); } + hfs_unlockfour(fdcp, fcp, tdcp, tcp); + if (took_trunc_lock) { hfs_unlock_truncate(VTOC(tvp), HFS_LOCK_DEFAULT); } - hfs_unlockfour(fdcp, fcp, tdcp, tcp); - /* Now vnode_put the resource forks vnodes if necessary */ if (tvp_rsrc) { vnode_put(tvp_rsrc); diff --git a/bsd/hfs/hfscommon/Catalog/FileIDsServices.c b/bsd/hfs/hfscommon/Catalog/FileIDsServices.c index d53fd5fd5..909ab5c1d 100644 --- a/bsd/hfs/hfscommon/Catalog/FileIDsServices.c +++ b/bsd/hfs/hfscommon/Catalog/FileIDsServices.c @@ -642,7 +642,7 @@ static OSErr MoveExtents( ExtendedVCB *vcb, u_int32_t srcFileID, u_int32_t dest if ( i != kNumExtentsToCache ) // if the buffer is not full, we must be done { - err = DeleteExtents( vcb, srcFileID, forkType, quitEarly, isHFSPlus ); // Now delete all the extent entries with the sourceID + err = DeleteExtents( vcb, srcFileID, quitEarly, forkType, isHFSPlus ); // Now delete all the extent entries with the sourceID if ( DEBUG_BUILD && err != noErr ) DebugStr("Error from DeleteExtents"); break; // we're done! diff --git a/bsd/hfs/hfscommon/Misc/VolumeAllocation.c b/bsd/hfs/hfscommon/Misc/VolumeAllocation.c index b49cf439c..79547be7f 100644 --- a/bsd/hfs/hfscommon/Misc/VolumeAllocation.c +++ b/bsd/hfs/hfscommon/Misc/VolumeAllocation.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2014 Apple Inc. All rights reserved. + * Copyright (c) 2000-2015 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -155,31 +155,36 @@ Optimization Routines */ -#include "../../hfs_macos_defs.h" - #include #include + + +#if !HFS_ALLOC_TEST + +#include "../../hfs_macos_defs.h" #include -#include -#include #include -#include #include -#include /* For VM Page size */ #include - #include "../../hfs.h" -#include "../../hfs_dbg.h" -#include "../../hfs_format.h" #include "../../hfs_endian.h" -#include "../../hfs_macos_defs.h" #include "../headers/FileMgrInternal.h" +#include + +#endif // !HFS_ALLOC_TEST + +#include +#include +#include +#include + +#include "../../hfs_dbg.h" +#include "../../hfs_format.h" #include "../../hfs_kdebug.h" /* Headers for unmap-on-mount support */ -#include #include #ifndef CONFIG_HFS_TRIM @@ -357,6 +362,30 @@ static void remove_free_extent_cache(struct hfsmount *hfsmp, u_int32_t startBloc static Boolean add_free_extent_cache(struct hfsmount *hfsmp, u_int32_t startBlock, u_int32_t blockCount); static void sanity_check_free_ext(struct hfsmount *hfsmp, int check_allocated); +/* Functions for getting free exents */ + +typedef struct bitmap_context { + void *bitmap; // current bitmap chunk + uint32_t run_offset; // offset (in bits) from start of bitmap to start of current run + uint32_t chunk_current; // next bit to scan in the chunk + uint32_t chunk_end; // number of valid bits in this chunk + struct hfsmount *hfsmp; + struct buf *bp; + uint32_t last_free_summary_bit; // last marked free summary bit + int lockflags; + uint64_t lock_start; +} bitmap_context_t; + + +static errno_t get_more_bits(bitmap_context_t *bitmap_ctx); +static int bit_count_set(void *bitmap, int start, int end); +static int bit_count_clr(void *bitmap, int start, int end); +static errno_t hfs_bit_count(bitmap_context_t *bitmap_ctx, int (*fn)(void *, int ,int), uint32_t *bit_count); +static errno_t hfs_bit_count_set(bitmap_context_t *bitmap_ctx, uint32_t *count); +static errno_t hfs_bit_count_clr(bitmap_context_t *bitmap_ctx, uint32_t *count); +static errno_t update_summary_table(bitmap_context_t *bitmap_ctx, uint32_t start, uint32_t count, bool set); +static int clzll(uint64_t x); + #if ALLOC_DEBUG /* * Validation Routine to verify that the TRIM list maintained by the journal @@ -5153,3 +5182,462 @@ static void sanity_check_free_ext(struct hfsmount *hfsmp, int check_allocated) lck_spin_unlock(&hfsmp->vcbFreeExtLock); } +#define BIT_RIGHT_MASK(bit) (0xffffffffffffffffull >> (bit)) +#define kHighBitInDoubleWordMask 0x8000000000000000ull + +static int clzll(uint64_t x) +{ + if (x == 0) + return 64; + else + return __builtin_clzll(x); +} + +#if !HFS_ALLOC_TEST + +static errno_t get_more_bits(bitmap_context_t *bitmap_ctx) +{ + uint32_t start_bit; + uint32_t iosize = 0; + uint32_t byte_offset; + uint32_t last_bitmap_block; + int error; + struct hfsmount *hfsmp = bitmap_ctx->hfsmp; +#if !HFS_ALLOC_TEST + uint64_t lock_elapsed; +#endif + + + if (bitmap_ctx->bp) + ReleaseScanBitmapRange(bitmap_ctx->bp); + + if (msleep(NULL, NULL, PINOD | PCATCH, + "hfs_fsinfo", NULL) == EINTR) { + return EINTR; + } + +#if !HFS_ALLOC_TEST + /* + * Let someone else use the allocation map after we've processed over HFS_FSINFO_MAX_LOCKHELD_TIME . + * lock_start is initialized in hfs_find_free_extents(). 
+ */ + absolutetime_to_nanoseconds(mach_absolute_time() - bitmap_ctx->lock_start, &lock_elapsed); + + if (lock_elapsed >= HFS_FSINFO_MAX_LOCKHELD_TIME) { + + hfs_systemfile_unlock(hfsmp, bitmap_ctx->lockflags); + + /* add tsleep here to force context switch and fairness */ + tsleep((caddr_t)get_more_bits, PRIBIO, "hfs_fsinfo", 1); + + hfs_journal_lock(hfsmp); + + /* Flush the journal and wait for all I/Os to finish up */ + error = hfs_journal_flush(hfsmp, TRUE); + if (error) { + hfs_journal_unlock(hfsmp); + return error; + } + + /* + * Take bitmap lock to ensure it is not being modified while journal is still held. + * Since we are reading larger than normal blocks from the bitmap, which + * might confuse other parts of the bitmap code using normal blocks, we + * take exclusive lock here. + */ + bitmap_ctx->lockflags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK); + + bitmap_ctx->lock_start = mach_absolute_time(); + + /* Release the journal lock */ + hfs_journal_unlock(hfsmp); + + /* + * Bitmap is read in large block size (up to 1MB), + * unlike the runtime which reads the bitmap in the + * 4K block size. If the bitmap is read by both ways + * at the same time, it can result in multiple buf_t with + * different sizes and potentially case data corruption. + * To avoid this, we invalidate all the existing buffers + * associated with the bitmap vnode. + */ + error = buf_invalidateblks(hfsmp->hfs_allocation_vp, 0, 0, 0); + if (error) { + /* hfs_systemfile_unlock will be called in the caller */ + return error; + } + } +#endif + + start_bit = bitmap_ctx->run_offset; + + if (start_bit >= bitmap_ctx->hfsmp->totalBlocks) { + bitmap_ctx->chunk_end = 0; + bitmap_ctx->bp = NULL; + bitmap_ctx->bitmap = NULL; + return 0; + } + + assert(start_bit % 8 == 0); + + /* + * Compute how much I/O we should generate here. + * hfs_scan_range_size will validate that the start bit + * converted into a byte offset into the bitmap file, + * is aligned on a VBMIOSize boundary. + */ + error = hfs_scan_range_size (bitmap_ctx->hfsmp, start_bit, &iosize); + if (error) + return error; + + /* hfs_scan_range_size should have verified startbit. Convert it to bytes */ + byte_offset = start_bit / kBitsPerByte; + + /* + * When the journal replays blocks, it does so by writing directly to the disk + * device (bypassing any filesystem vnodes and such). When it finishes its I/Os + * it also immediately re-reads and invalidates the range covered by the bp so + * it does not leave anything lingering in the cache (for iosize reasons). + * + * As such, it is safe to do large I/Os here with ReadBitmapRange. + * + * NOTE: It is not recommended, but it is possible to call the function below + * on sections of the bitmap that may be in core already as long as the pages are not + * dirty. In that case, we'd notice that something starting at that + * logical block of the bitmap exists in the metadata cache, and we'd check + * if the iosize requested is the same as what was already allocated for it. + * Odds are pretty good we're going to request something larger. In that case, + * we just free the existing memory associated with the buf and reallocate a + * larger range. This function should immediately invalidate it as soon as we're + * done scanning, so this shouldn't cause any coherency issues. 
+ */ + error = ReadBitmapRange(bitmap_ctx->hfsmp, byte_offset, iosize, (uint32_t **)&bitmap_ctx->bitmap, &bitmap_ctx->bp); + if (error) + return error; + + /* + * At this point, we have a giant wired buffer that represents some portion of + * the bitmap file that we want to analyze. We may not have gotten all 'iosize' + * bytes though, so clip our ending bit to what we actually read in. + */ + last_bitmap_block = start_bit + buf_count(bitmap_ctx->bp) * kBitsPerByte; + + /* Cap the last block to the total number of blocks if required */ + if (last_bitmap_block > bitmap_ctx->hfsmp->totalBlocks) + last_bitmap_block = bitmap_ctx->hfsmp->totalBlocks; + + bitmap_ctx->chunk_current = 0; // new chunk of bitmap + bitmap_ctx->chunk_end = last_bitmap_block - start_bit; + + return 0; +} + +#endif // !HFS_ALLOC_TEST + +// Returns number of contiguous bits set at start +static int bit_count_set(void *bitmap, int start, int end) +{ + if (start == end) + return 0; + + assert(end > start); + + const int start_bit = start & 63; + const int end_bit = end & 63; + + uint64_t *p = (uint64_t *)bitmap + start / 64; + uint64_t x = ~OSSwapBigToHostInt64(*p); + + if ((start & ~63) == (end & ~63)) { + // Start and end in same 64 bits + x = (x & BIT_RIGHT_MASK(start_bit)) | BIT_RIGHT_MASK(end_bit); + return clzll(x) - start_bit; + } + + // Deal with initial unaligned bit + x &= BIT_RIGHT_MASK(start_bit); + + if (x) + return clzll(x) - start_bit; + + // Go fast + ++p; + int count = 64 - start_bit; + int nquads = (end - end_bit - start - 1) / 64; + + while (nquads--) { + if (*p != 0xffffffffffffffffull) { + x = ~OSSwapBigToHostInt64(*p); + return count + clzll(x); + } + ++p; + count += 64; + } + + if (end_bit) { + x = ~OSSwapBigToHostInt64(*p) | BIT_RIGHT_MASK(end_bit); + count += clzll(x); + } + + return count; +} + +/* Returns the number of a run of cleared bits: + * bitmap is a single chunk of memory being examined + * start: the start bit relative to the current buffer to be examined; start is inclusive. + * end: the end bit relative to the current buffer to be examined; end is not inclusive. 
+ */ +static int bit_count_clr(void *bitmap, int start, int end) +{ + if (start == end) + return 0; + + assert(end > start); + + const int start_bit = start & 63; + const int end_bit = end & 63; + + uint64_t *p = (uint64_t *)bitmap + start / 64; + uint64_t x = OSSwapBigToHostInt64(*p); + + if ((start & ~63) == (end & ~63)) { + // Start and end in same 64 bits + x = (x & BIT_RIGHT_MASK(start_bit)) | BIT_RIGHT_MASK(end_bit); + + return clzll(x) - start_bit; + } + + // Deal with initial unaligned bit + x &= BIT_RIGHT_MASK(start_bit); + + if (x) + return clzll(x) - start_bit; + + // Go fast + ++p; + int count = 64 - start_bit; + int nquads = (end - end_bit - start - 1) / 64; + + while (nquads--) { + if (*p) { + x = OSSwapBigToHostInt64(*p); + return count + clzll(x); + } + ++p; + count += 64; + } + + if (end_bit) { + x = OSSwapBigToHostInt64(*p) | BIT_RIGHT_MASK(end_bit); + + count += clzll(x); + } + + return count; +} + +#if !HFS_ALLOC_TEST +static errno_t update_summary_table(bitmap_context_t *bitmap_ctx, uint32_t start, uint32_t count, bool set) +{ + uint32_t end, start_summary_bit, end_summary_bit; + errno_t error = 0; + + if (count == 0) + goto out; + + if (!ISSET(bitmap_ctx->hfsmp->hfs_flags, HFS_SUMMARY_TABLE)) + return 0; + + if (hfs_get_summary_index (bitmap_ctx->hfsmp, start, &start_summary_bit)) { + error = EINVAL; + goto out; + } + + end = start + count - 1; + if (hfs_get_summary_index (bitmap_ctx->hfsmp, end, &end_summary_bit)) { + error = EINVAL; + goto out; + } + + // if summary table bit has been updated with free block previously, leave it. + if ((start_summary_bit == bitmap_ctx->last_free_summary_bit) && set) + start_summary_bit++; + + for (uint32_t summary_bit = start_summary_bit; summary_bit <= end_summary_bit; summary_bit++) + hfs_set_summary (bitmap_ctx->hfsmp, summary_bit, set); + + if (!set) + bitmap_ctx->last_free_summary_bit = end_summary_bit; + +out: + return error; + +} +#endif //!HFS_ALLOC_TEST + +/* + * Read in chunks of the bitmap into memory, and find a run of cleared/set bits; + * the run can extend across chunk boundaries. + * bit_count_clr can be passed to get a run of cleared bits. + * bit_count_set can be passed to get a run of set bits. + */ +static errno_t hfs_bit_count(bitmap_context_t *bitmap_ctx, int (*fn)(void *, int ,int), uint32_t *bit_count) +{ + int count; + errno_t error = 0; + + *bit_count = 0; + + do { + if (bitmap_ctx->run_offset == 0 || bitmap_ctx->chunk_current == bitmap_ctx->chunk_end) { + if ((error = get_more_bits(bitmap_ctx)) != 0) + goto out; + } + + if (bitmap_ctx->chunk_end == 0) + break; + + count = fn(bitmap_ctx->bitmap, bitmap_ctx->chunk_current, bitmap_ctx->chunk_end); + + bitmap_ctx->run_offset += count; + bitmap_ctx->chunk_current += count; + *bit_count += count; + + } while (bitmap_ctx->chunk_current >= bitmap_ctx->chunk_end && count); + +out: + return error; + +} + +// Returns count of number of bits clear +static errno_t hfs_bit_count_clr(bitmap_context_t *bitmap_ctx, uint32_t *count) +{ + return hfs_bit_count(bitmap_ctx, bit_count_clr, count); +} + +// Returns count of number of bits set +static errno_t hfs_bit_count_set(bitmap_context_t *bitmap_ctx, uint32_t *count) +{ + return hfs_bit_count(bitmap_ctx, bit_count_set, count); +} + +static uint32_t hfs_bit_offset(bitmap_context_t *bitmap_ctx) +{ + return bitmap_ctx->run_offset; +} + +/* + * Perform a full scan of the bitmap file. + * Note: during the scan of bitmap file, it may drop and reacquire the + * bitmap lock to let someone else use the bitmap for fairness. 
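Both counters above rely on the same trick: byte-swap one big-endian word of the allocation bitmap, shift away the bits already consumed, and let count-leading-zeros report how long the current run is. A small standalone illustration of that idea for runs of clear (free) blocks, assuming a little-endian host so __builtin_bswap64 stands in for OSSwapBigToHostInt64; the chunk bookkeeping and end-of-buffer handling of the kernel code are deliberately simplified:

#include <stdint.h>
#include <stdio.h>

static int clz64(uint64_t x) { return x == 0 ? 64 : __builtin_clzll(x); }

static int
run_of_clear_bits(const uint64_t *bitmap, int start, int end)
{
	int count = 0;

	while (start + count < end) {
		uint64_t w = __builtin_bswap64(bitmap[(start + count) / 64]);
		int bit = (start + count) & 63;
		int zeros = clz64(w << bit);    /* clear bits from 'bit' onward in this word */

		if (zeros > 64 - bit)
			zeros = 64 - bit;       /* whole remainder of the word is clear */
		count += zeros;
		if (zeros < 64 - bit)
			break;                  /* hit an allocated block */
	}
	return (start + count > end) ? end - start : count;
}

int
main(void)
{
	/* Blocks 0-2 and 10 allocated, 3-9 free, in big-endian on-disk order. */
	uint64_t word = 0xE020000000000000ull;
	uint64_t bitmap[1] = { __builtin_bswap64(word) };

	printf("free run starting at block 3: %d blocks\n",
	       run_of_clear_bits(bitmap, 3, 64));   /* prints 7 */
	return 0;
}
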
+ * Currently it is used by HFS_GET_FSINFO statistic gathing, which + * is run while other processes might perform HFS operations. + */ + +errno_t hfs_find_free_extents(struct hfsmount *hfsmp, + void (*callback)(void *data, off_t free_extent_size), void *callback_arg) +{ + struct bitmap_context bitmap_ctx; + uint32_t count; + errno_t error = 0; + + if ((hfsmp->hfs_flags & HFS_SUMMARY_TABLE) == 0) { + error = hfs_init_summary(hfsmp); + if (error) + return error; + } + + bzero(&bitmap_ctx, sizeof(struct bitmap_context)); + + /* + * The journal maintains list of recently deallocated blocks to + * issue DKIOCUNMAPs when the corresponding journal transaction is + * flushed to the disk. To avoid any race conditions, we only + * want one active trim list. Therefore we make sure that the + * journal trim list is sync'ed, empty, and not modifiable for + * the duration of our scan. + * + * Take the journal lock before flushing the journal to the disk. + * We will keep on holding the journal lock till we don't get the + * bitmap lock to make sure that no new journal transactions can + * start. This will make sure that the journal trim list is not + * modified after the journal flush and before getting bitmap lock. + * We can release the journal lock after we acquire the bitmap + * lock as it will prevent any further block deallocations. + */ + hfs_journal_lock(hfsmp); + + /* Flush the journal and wait for all I/Os to finish up */ + error = hfs_journal_flush(hfsmp, TRUE); + if (error) { + hfs_journal_unlock(hfsmp); + return error; + } + + /* + * Take bitmap lock to ensure it is not being modified. + * Since we are reading larger than normal blocks from the bitmap, which + * might confuse other parts of the bitmap code using normal blocks, we + * take exclusive lock here. + */ + bitmap_ctx.lockflags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK); + +#if !HFS_ALLOC_TEST + bitmap_ctx.lock_start = mach_absolute_time(); +#endif + + /* Release the journal lock */ + hfs_journal_unlock(hfsmp); + + /* + * Bitmap is read in large block size (up to 1MB), + * unlike the runtime which reads the bitmap in the + * 4K block size. If the bitmap is read by both ways + * at the same time, it can result in multiple buf_t with + * different sizes and potentially case data corruption. + * To avoid this, we invalidate all the existing buffers + * associated with the bitmap vnode. + */ + error = buf_invalidateblks(hfsmp->hfs_allocation_vp, 0, 0, 0); + if (error) + goto out; + + /* + * Get the list of all free extent ranges. hfs_alloc_scan_range() + * will call hfs_fsinfo_data_add() to account for all the free + * extent ranges found during scan. 
+ */ + bitmap_ctx.hfsmp = hfsmp; + bitmap_ctx.run_offset = 0; + + while (bitmap_ctx.run_offset < hfsmp->totalBlocks) { + + uint32_t start = hfs_bit_offset(&bitmap_ctx); + + if ((error = hfs_bit_count_clr(&bitmap_ctx, &count)) != 0) + goto out; + + if (count) + callback(callback_arg, hfs_blk_to_bytes(count, hfsmp->blockSize)); + + if ((error = update_summary_table(&bitmap_ctx, start, count, false)) != 0) + goto out; + + start = hfs_bit_offset(&bitmap_ctx); + + if ((error = hfs_bit_count_set(&bitmap_ctx, &count)) != 0) + goto out; + + if ((error = update_summary_table(&bitmap_ctx, start, count, true)) != 0) + goto out; + } + +out: + if (bitmap_ctx.lockflags) { + hfs_systemfile_unlock(hfsmp, bitmap_ctx.lockflags); + } + + return error; +} + diff --git a/bsd/hfs/hfscommon/headers/FileMgrInternal.h b/bsd/hfs/hfscommon/headers/FileMgrInternal.h index 18e64caf4..30eb8a84e 100644 --- a/bsd/hfs/hfscommon/headers/FileMgrInternal.h +++ b/bsd/hfs/hfscommon/headers/FileMgrInternal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2013 Apple Inc. All rights reserved. + * Copyright (c) 2000-2015 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -46,11 +46,14 @@ #include #include +#if !HFS_ALLOC_TEST + #include "../../hfs.h" #include "../../hfs_macos_defs.h" #include "../../hfs_format.h" #include "../../hfs_cnode.h" +#endif #ifdef __cplusplus extern "C" { @@ -255,6 +258,9 @@ ScanUnmapBlocks(struct hfsmount *hfsmp); EXTERN_API_C( int ) hfs_init_summary (struct hfsmount *hfsmp); +errno_t hfs_find_free_extents(struct hfsmount *hfsmp, + void (*callback)(void *data, off_t), void *callback_arg); + /* File Extent Mapping routines*/ EXTERN_API_C( OSErr ) FlushExtentFile (ExtendedVCB * vcb); diff --git a/bsd/kern/bsd_init.c b/bsd/kern/bsd_init.c index 105922628..b344c7a9d 100644 --- a/bsd/kern/bsd_init.c +++ b/bsd/kern/bsd_init.c @@ -287,6 +287,7 @@ void bsd_utaskbootstrap(void); static void parse_bsd_args(void); extern task_t bsd_init_task; +extern boolean_t init_task_died; extern char init_task_failure_data[]; #if CONFIG_DEV_KMEM extern void dev_kmem_init(void); @@ -1013,6 +1014,7 @@ bsdinit_task(void) ut = (uthread_t)get_bsdthread_info(thread); bsd_init_task = get_threadtask(thread); + init_task_died = FALSE; init_task_failure_data[0] = 0; #if CONFIG_MACF diff --git a/bsd/kern/kdebug.c b/bsd/kern/kdebug.c index 1b89e2d4a..65c98080d 100644 --- a/bsd/kern/kdebug.c +++ b/bsd/kern/kdebug.c @@ -74,6 +74,14 @@ #include +extern boolean_t kdebug_serial; +#if KDEBUG_MOJO_TRACE +#include +static void kdebug_serial_print( /* forward */ + uint32_t, uint32_t, uint64_t, + uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t); +#endif + /* * IOP(s) * @@ -302,7 +310,6 @@ pid_t global_state_pid = -1; /* Used to control exclusive use of kd_buffer #define MACH_SysCall 0x010c0000 #define DBG_SCALL_MASK 0xffff0000 - /* task to string structure */ struct tts { @@ -392,7 +399,6 @@ kdbg_set_tracing_enabled(boolean_t enabled, uint32_t trace_type) { int s = ml_set_interrupts_enabled(FALSE); lck_spin_lock(kds_spin_lock); - if (enabled) { kdebug_enable |= trace_type; kd_ctrl_page.kdebug_slowcheck &= ~SLOW_NOLOG; @@ -881,6 +887,12 @@ record_event: kdbp = &kdbip[coreid]; timestamp &= KDBG_TIMESTAMP_MASK; +#if KDEBUG_MOJO_TRACE + if (kdebug_enable & KDEBUG_ENABLE_SERIAL) + kdebug_serial_print(coreid, debugid, timestamp, + arg1, arg2, arg3, arg4, threadid); +#endif + retry_q: kds_raw = kdbp->kd_list_tail; @@ -1057,6 +1069,14 @@ record_event: cpu = cpu_number(); kdbp = &kdbip[cpu]; + +#if KDEBUG_MOJO_TRACE + if 
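hfs_find_free_extents() above hands each run of clear bits to the caller as a byte count. A sketch of a hypothetical consumer follows; the struct and helper names here are illustrative only, while the in-tree caller is hfs_get_fsinfo(), which routes the sizes into hfs_fsinfo_data_add() buckets:

struct free_extent_stats {
	uint64_t	total_free_bytes;
	uint64_t	largest_extent_bytes;
	uint32_t	extent_count;
};

static void
record_free_extent(void *data, off_t free_extent_size)
{
	struct free_extent_stats *stats = data;

	stats->total_free_bytes += free_extent_size;
	stats->extent_count++;
	if ((uint64_t)free_extent_size > stats->largest_extent_bytes)
		stats->largest_extent_bytes = free_extent_size;
}

static errno_t
summarize_free_space(struct hfsmount *hfsmp, struct free_extent_stats *stats)
{
	bzero(stats, sizeof(*stats));

	/* May block: the scan flushes the journal, takes the bitmap lock
	 * exclusively, and periodically drops it for fairness as described
	 * in get_more_bits() above. */
	return hfs_find_free_extents(hfsmp, record_free_extent, stats);
}
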
(kdebug_enable & KDEBUG_ENABLE_SERIAL) + kdebug_serial_print(cpu, debugid, + mach_absolute_time() & KDBG_TIMESTAMP_MASK, + arg1, arg2, arg3, arg4, arg5); +#endif + retry_q: kds_raw = kdbp->kd_list_tail; @@ -1168,7 +1188,7 @@ kernel_debug_string(const char *message) /* Stuff the message string in the args and log it. */ strncpy((char *)arg, message, MIN(sizeof(arg), strlen(message))); KERNEL_DEBUG_EARLY( - (TRACEDBG_CODE(DBG_TRACE_INFO, 4)) | DBG_FUNC_NONE, + TRACE_INFO_STRING, arg[0], arg[1], arg[2], arg[3]); } @@ -1186,8 +1206,10 @@ kernel_debug_early( uintptr_t arg4) { /* If tracing is already initialized, use it */ - if (nkdbufs) + if (nkdbufs) { KERNEL_DEBUG_CONSTANT(debugid, arg1, arg2, arg3, arg4, 0); + return; + } /* Do nothing if the buffer is full or we're not on the boot cpu */ kd_early_overflow = kd_early_index >= KD_EARLY_BUFFER_MAX; @@ -1206,7 +1228,7 @@ kernel_debug_early( } /* - * Transfer the contents of the temporary buffer into the trace buffers. + * Transfen the contents of the temporary buffer into the trace buffers. * Precede that by logging the rebase time (offset) - the TSC-based time (in ns) * when mach_absolute_time is set to 0. */ @@ -1221,7 +1243,7 @@ kernel_debug_early_end(void) /* Fake sentinel marking the start of kernel time relative to TSC */ kernel_debug_enter( 0, - (TRACEDBG_CODE(DBG_TRACE_INFO, 1)) | DBG_FUNC_NONE, + TRACE_TIMESTAMPS, 0, (uint32_t)(tsc_rebase_abs_time >> 32), (uint32_t)tsc_rebase_abs_time, @@ -1243,7 +1265,7 @@ kernel_debug_early_end(void) /* Cut events-lost event on overflow */ if (kd_early_overflow) KERNEL_DEBUG_CONSTANT( - TRACEDBG_CODE(DBG_TRACE_INFO, 2), 0, 0, 0, 0, 0); + TRACE_LOST_EVENTS, 0, 0, 0, 0, 0); /* This trace marks the start of kernel tracing */ kernel_debug_string("early trace done"); @@ -2453,9 +2475,9 @@ kdbg_control(int *name, u_int namelen, user_addr_t where, size_t *sizep) if (name[0] == KERN_KDWRITETR) { number = nkdbufs * sizeof(kd_buf); - KERNEL_DEBUG_CONSTANT((TRACEDBG_CODE(DBG_TRACE_INFO, 3)) | DBG_FUNC_START, 0, 0, 0, 0, 0); + KERNEL_DEBUG_CONSTANT(TRACE_WRITING_EVENTS | DBG_FUNC_START, 0, 0, 0, 0, 0); ret = kdbg_read(0, &number, vp, &context); - KERNEL_DEBUG_CONSTANT((TRACEDBG_CODE(DBG_TRACE_INFO, 3)) | DBG_FUNC_END, number, 0, 0, 0, 0); + KERNEL_DEBUG_CONSTANT(TRACE_WRITING_EVENTS | DBG_FUNC_END, number, 0, 0, 0, 0); *sizep = number; } else { @@ -2635,7 +2657,7 @@ kdbg_read(user_addr_t buffer, size_t *number, vnode_t vp, vfs_context_t ctx) return EINVAL; memset(&lostevent, 0, sizeof(lostevent)); - lostevent.debugid = TRACEDBG_CODE(DBG_TRACE_INFO, 2); + lostevent.debugid = TRACE_LOST_EVENTS; /* Capture timestamp. Only sort events that have occured before the timestamp. * Since the iop is being flushed here, its possible that events occur on the AP @@ -3107,7 +3129,11 @@ start_kern_tracing(unsigned int new_nkdbufs, boolean_t need_map) /* Hold off interrupts until the early traces are cut */ boolean_t s = ml_set_interrupts_enabled(FALSE); - kdbg_set_tracing_enabled(TRUE, KDEBUG_ENABLE_TRACE); + kdbg_set_tracing_enabled( + TRUE, + kdebug_serial ? 
+ (KDEBUG_ENABLE_TRACE | KDEBUG_ENABLE_SERIAL) : + KDEBUG_ENABLE_TRACE); /* * Transfer all very early events from the static buffer @@ -3118,8 +3144,14 @@ start_kern_tracing(unsigned int new_nkdbufs, boolean_t need_map) ml_set_interrupts_enabled(s); printf("kernel tracing started\n"); +#if KDEBUG_MOJO_TRACE + if (kdebug_serial) { + printf("serial output enabled with %lu named events\n", + sizeof(kd_events)/sizeof(kd_event_t)); + } +#endif } else { - printf("error from kdbg_reinit,kernel tracing not started\n"); + printf("error from kdbg_reinit, kernel tracing not started\n"); } } @@ -3167,7 +3199,7 @@ kdbg_dump_trace_to_file(const char *filename) return; } } - KERNEL_DEBUG_CONSTANT((TRACEDBG_CODE(DBG_TRACE_INFO, 0)) | DBG_FUNC_NONE, 0, 0, 0, 0, 0); + KERNEL_DEBUG_CONSTANT(TRACE_PANIC | DBG_FUNC_NONE, 0, 0, 0, 0, 0); kdebug_enable = 0; kd_ctrl_page.enabled = 0; @@ -3209,3 +3241,146 @@ void kdbg_get_task_name(char* name_buf, int len, task_t task) else snprintf(name_buf, len, "%p [!bsd]", task); } + +#if KDEBUG_MOJO_TRACE +static kd_event_t * +binary_search(uint32_t id) +{ + int low, high, mid; + + low = 0; + high = sizeof(kd_events)/sizeof(kd_event_t) - 1; + + while (TRUE) + { + mid = (low + high) / 2; + + if (low > high) + return NULL; /* failed */ + else if ( low + 1 >= high) { + /* We have a match */ + if (kd_events[high].id == id) + return &kd_events[high]; + else if (kd_events[low].id == id) + return &kd_events[low]; + else + return NULL; /* search failed */ + } + else if (id < kd_events[mid].id) + high = mid; + else + low = mid; + } +} + +/* + * Look up event id to get name string. + * Using a per-cpu cache of a single entry + * before resorting to a binary search of the full table. + */ +#define NCACHE 1 +static kd_event_t *last_hit[MAX_CPUS]; +static kd_event_t * +event_lookup_cache(uint32_t cpu, uint32_t id) +{ + if (last_hit[cpu] == NULL || last_hit[cpu]->id != id) + last_hit[cpu] = binary_search(id); + return last_hit[cpu]; +} + +static uint64_t kd_last_timstamp; + +static void +kdebug_serial_print( + uint32_t cpunum, + uint32_t debugid, + uint64_t timestamp, + uintptr_t arg1, + uintptr_t arg2, + uintptr_t arg3, + uintptr_t arg4, + uintptr_t threadid + ) +{ + char kprintf_line[192]; + char event[40]; + uint64_t us = timestamp / NSEC_PER_USEC; + uint64_t us_tenth = (timestamp % NSEC_PER_USEC) / 100; + uint64_t delta = timestamp - kd_last_timstamp; + uint64_t delta_us = delta / NSEC_PER_USEC; + uint64_t delta_us_tenth = (delta % NSEC_PER_USEC) / 100; + uint32_t event_id = debugid & DBG_FUNC_MASK; + const char *command; + const char *bra; + const char *ket; + kd_event_t *ep; + + /* event time and delta from last */ + snprintf(kprintf_line, sizeof(kprintf_line), + "%11llu.%1llu %8llu.%1llu ", + us, us_tenth, delta_us, delta_us_tenth); + + + /* event (id or name) - start prefixed by "[", end postfixed by "]" */ + bra = (debugid & DBG_FUNC_START) ? "[" : " "; + ket = (debugid & DBG_FUNC_END) ? "]" : " "; + ep = event_lookup_cache(cpunum, event_id); + if (ep) { + if (strlen(ep->name) < sizeof(event) - 3) + snprintf(event, sizeof(event), "%s%s%s", + bra, ep->name, ket); + else + snprintf(event, sizeof(event), "%s%x(name too long)%s", + bra, event_id, ket); + } else { + snprintf(event, sizeof(event), "%s%x%s", + bra, event_id, ket); + } + snprintf(kprintf_line + strlen(kprintf_line), + sizeof(kprintf_line) - strlen(kprintf_line), + "%-40s ", event); + + /* arg1 .. 
arg4 with special cases for strings */ + switch (event_id) { + case VFS_LOOKUP: + case VFS_LOOKUP_DONE: + if (debugid & DBG_FUNC_START) { + /* arg1 hex then arg2..arg4 chars */ + snprintf(kprintf_line + strlen(kprintf_line), + sizeof(kprintf_line) - strlen(kprintf_line), + "%-16lx %-8s%-8s%-8s ", + arg1, (char*)&arg2, (char*)&arg3, (char*)&arg4); + break; + } + /* else fall through for arg1..arg4 chars */ + case TRACE_STRING_EXEC: + case TRACE_STRING_NEWTHREAD: + case TRACE_INFO_STRING: + snprintf(kprintf_line + strlen(kprintf_line), + sizeof(kprintf_line) - strlen(kprintf_line), + "%-8s%-8s%-8s%-8s ", + (char*)&arg1, (char*)&arg2, (char*)&arg3, (char*)&arg4); + break; + default: + snprintf(kprintf_line + strlen(kprintf_line), + sizeof(kprintf_line) - strlen(kprintf_line), + "%-16lx %-16lx %-16lx %-16lx", + arg1, arg2, arg3, arg4); + } + + /* threadid, cpu and command name */ + if (threadid == (uintptr_t)thread_tid(current_thread()) && + current_proc() && + current_proc()->p_comm) + command = current_proc()->p_comm; + else + command = "-"; + snprintf(kprintf_line + strlen(kprintf_line), + sizeof(kprintf_line) - strlen(kprintf_line), + " %-16lx %-2d %s\n", + threadid, cpunum, command); + + kprintf("%s", kprintf_line); + kd_last_timstamp = timestamp; +} +#endif diff --git a/bsd/kern/kern_control.c b/bsd/kern/kern_control.c index 16a66ae82..8e5b0150b 100644 --- a/bsd/kern/kern_control.c +++ b/bsd/kern/kern_control.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1999-2014 Apple Inc. All rights reserved. + * Copyright (c) 1999-2015 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -317,6 +317,8 @@ ctl_connect(struct socket *so, struct sockaddr *nam, struct proc *p) struct sockaddr_ctl sa; struct ctl_cb *kcb = (struct ctl_cb *)so->so_pcb; struct ctl_cb *kcb_next = NULL; + u_quad_t sbmaxsize; + u_int32_t recvbufsize, sendbufsize; if (kcb == 0) panic("ctl_connect so_pcb null\n"); @@ -391,11 +393,27 @@ ctl_connect(struct socket *so, struct sockaddr *nam, struct proc *p) kctlstat.kcs_connections++; lck_mtx_unlock(ctl_mtx); - error = soreserve(so, kctl->sendbufsize, kctl->recvbufsize); + /* + * rdar://15526688: Limit the send and receive sizes to sb_max + * by using the same scaling as sbreserve() + */ + sbmaxsize = (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES); + + if (kctl->sendbufsize > sbmaxsize) + sendbufsize = sbmaxsize; + else + sendbufsize = kctl->sendbufsize; + + if (kctl->recvbufsize > sbmaxsize) + recvbufsize = sbmaxsize; + else + recvbufsize = kctl->recvbufsize; + + error = soreserve(so, sendbufsize, recvbufsize); if (error) { printf("%s - soreserve(%llx, %u, %u) error %d\n", __func__, (uint64_t)VM_KERNEL_ADDRPERM(so), - kctl->sendbufsize, kctl->recvbufsize, error); + sendbufsize, recvbufsize, error); goto done; } soisconnecting(so); @@ -631,7 +649,7 @@ ctl_rcvbspace(struct kctl *kctl, struct socket *so, u_int32_t datasize, struct sockbuf *sb = &so->so_rcv; u_int32_t space = sbspace(sb); errno_t error; - + if ((kctl->flags & CTL_FLAG_REG_CRIT) == 0) { if ((u_int32_t) space >= datasize) error = 0; @@ -1116,10 +1134,9 @@ ctl_register(struct kern_ctl_reg *userkctl, kern_ctl_ref *kctlref) { struct kctl *kctl = NULL; struct kctl *kctl_next = NULL; - u_int32_t id = 1; - size_t name_len; - int is_extended = 0; - u_quad_t sbmaxsize; + u_int32_t id = 1; + size_t name_len; + int is_extended = 0; if (userkctl == NULL) /* sanity check */ return (EINVAL); @@ -1210,27 +1227,19 @@ ctl_register(struct kern_ctl_reg *userkctl, kern_ctl_ref *kctlref) /* * Let the caller know the 
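The sbmaxsize clamp added to ctl_connect() above reuses sbreserve()'s scaling so a kernel control cannot ask soreserve() for more space than sbreserve() itself would grant. A quick standalone illustration of the arithmetic, with values that are assumptions for this sketch (sb_max is the kern.ipc.maxsockbuf tunable, MSIZE and MCLBYTES are mbuf constants):

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t sb_max   = 8 * 1024 * 1024;   /* assumed default ceiling */
	uint64_t msize    = 256;               /* illustrative mbuf size */
	uint64_t mclbytes = 2048;              /* illustrative cluster size */

	uint64_t sbmaxsize = sb_max * mclbytes / (msize + mclbytes);

	printf("requests above %llu bytes (~89%% of sb_max) are clamped\n",
	       (unsigned long long)sbmaxsize);
	return 0;
}
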
default send and receive sizes - * - * rdar://15526688: Limit the send and receive sizes to sb_max - * by using the same scaling as sbreserve() */ - sbmaxsize = (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES); - - if (userkctl->ctl_sendsize == 0) + if (userkctl->ctl_sendsize == 0) { kctl->sendbufsize = CTL_SENDSIZE; - else if (userkctl->ctl_sendsize > sbmaxsize) - kctl->sendbufsize = sbmaxsize; - else - kctl->sendbufsize = userkctl->ctl_sendsize; - userkctl->ctl_sendsize = kctl->sendbufsize; - - if (userkctl->ctl_recvsize == 0) + userkctl->ctl_sendsize = kctl->sendbufsize; + } else { + kctl->sendbufsize = userkctl->ctl_sendsize; + } + if (userkctl->ctl_recvsize == 0) { kctl->recvbufsize = CTL_RECVSIZE; - else if (userkctl->ctl_recvsize > sbmaxsize) - kctl->recvbufsize = sbmaxsize; - else - kctl->recvbufsize = userkctl->ctl_recvsize; - userkctl->ctl_recvsize = kctl->recvbufsize; + userkctl->ctl_recvsize = kctl->recvbufsize; + } else { + kctl->recvbufsize = userkctl->ctl_recvsize; + } kctl->connect = userkctl->ctl_connect; kctl->disconnect = userkctl->ctl_disconnect; diff --git a/bsd/kern/kern_event.c b/bsd/kern/kern_event.c index 3b16c4ca0..708aef474 100644 --- a/bsd/kern/kern_event.c +++ b/bsd/kern/kern_event.c @@ -87,6 +87,7 @@ #include #include #include +#include #include #include @@ -415,6 +416,7 @@ kqlock2knotedrop(struct kqueue *kq, struct knote *kn) int oktodrop; oktodrop = ((kn->kn_status & (KN_DROPPING | KN_ATTACHING)) == 0); + kn->kn_status &= ~KN_STAYQUEUED; kn->kn_status |= KN_DROPPING; if (oktodrop) { if (kn->kn_inuse == 0) { @@ -1180,6 +1182,7 @@ kqueue_alloc(struct proc *p) kq->kq_p = p; } else { FREE_ZONE(kq, sizeof (struct kqueue), M_KQUEUE); + kq = NULL; } } @@ -2624,10 +2627,7 @@ knote_unlink_wait_queue(struct knote *kn, struct wait_queue *wq, wait_queue_link kern_return_t kr; kr = wait_queue_unlink_nofree(wq, kq->kq_wqs, wqlp); - kqlock(kq); - kn->kn_status &= ~KN_STAYQUEUED; - knote_dequeue(kn); - kqunlock(kq); + knote_clearstayqueued(kn); return ((kr != KERN_SUCCESS) ? 
EINVAL : 0); } @@ -3517,3 +3517,12 @@ knote_markstayqueued(struct knote *kn) knote_enqueue(kn); kqunlock(kn->kn_kq); } + +void +knote_clearstayqueued(struct knote *kn) +{ + kqlock(kn->kn_kq); + kn->kn_status &= ~KN_STAYQUEUED; + knote_dequeue(kn); + kqunlock(kn->kn_kq); +} diff --git a/bsd/kern/kern_exec.c b/bsd/kern/kern_exec.c index 3d2710538..4816a4891 100644 --- a/bsd/kern/kern_exec.c +++ b/bsd/kern/kern_exec.c @@ -642,6 +642,13 @@ exec_fat_imgact(struct image_params *imgp) int nfat_arch = 0, pr = 0, f = 0; nfat_arch = OSSwapBigToHostInt32(fat_header->nfat_arch); + + /* make sure bogus nfat_arch doesn't cause chaos - 19376072 */ + if ( (sizeof(struct fat_header) + (nfat_arch * sizeof(struct fat_arch))) > PAGE_SIZE ) { + error = EBADEXEC; + goto bad; + } + /* Check each preference listed against all arches in header */ for (pr = 0; pr < NBINPREFS; pr++) { cpu_type_t pref = psa->psa_binprefs[pr]; @@ -1114,14 +1121,14 @@ grade: kdbg_trace_string(p, &dbg_arg1, &dbg_arg2, &dbg_arg3, &dbg_arg4); if (vfexec || spawn) { - KERNEL_DEBUG_CONSTANT1((TRACEDBG_CODE(DBG_TRACE_DATA, 2)) | DBG_FUNC_NONE, + KERNEL_DEBUG_CONSTANT1(TRACE_DATA_EXEC | DBG_FUNC_NONE, p->p_pid ,0,0,0, (uintptr_t)thread_tid(thread)); - KERNEL_DEBUG_CONSTANT1((TRACEDBG_CODE(DBG_TRACE_STRING, 2)) | DBG_FUNC_NONE, + KERNEL_DEBUG_CONSTANT1(TRACE_STRING_EXEC | DBG_FUNC_NONE, dbg_arg1, dbg_arg2, dbg_arg3, dbg_arg4, (uintptr_t)thread_tid(thread)); } else { - KERNEL_DEBUG_CONSTANT((TRACEDBG_CODE(DBG_TRACE_DATA, 2)) | DBG_FUNC_NONE, + KERNEL_DEBUG_CONSTANT(TRACE_DATA_EXEC | DBG_FUNC_NONE, p->p_pid ,0,0,0,0); - KERNEL_DEBUG_CONSTANT((TRACEDBG_CODE(DBG_TRACE_STRING, 2)) | DBG_FUNC_NONE, + KERNEL_DEBUG_CONSTANT(TRACE_STRING_EXEC | DBG_FUNC_NONE, dbg_arg1, dbg_arg2, dbg_arg3, dbg_arg4, 0); } } @@ -2429,7 +2436,7 @@ bad: /* notify only if it has not failed due to FP Key error */ if ((p->p_lflag & P_LTERM_DECRYPTFAIL) == 0) proc_knote(p, NOTE_EXEC); - } else { + } else if (error == 0) { /* reset the importance attribute from our previous life */ task_importance_reset(p->task); diff --git a/bsd/kern/kern_exit.c b/bsd/kern/kern_exit.c index c1bff3128..3d17f687c 100644 --- a/bsd/kern/kern_exit.c +++ b/bsd/kern/kern_exit.c @@ -149,6 +149,7 @@ extern void dtrace_lazy_dofs_destroy(proc_t); #include +extern boolean_t init_task_died; extern char init_task_failure_data[]; void proc_prepareexit(proc_t p, int rv, boolean_t perf_notify); void vfork_exit(proc_t p, int rv); @@ -354,6 +355,7 @@ exit1_internal(proc_t p, int rv, int *retval, boolean_t thread_can_terminate, bo sync(p, (void *)NULL, (int *)NULL); } #endif + init_task_died = TRUE; panic("%s died\nState at Last Exception:\n\n%s", (p->p_comm[0] != '\0' ? p->p_comm : diff --git a/bsd/kern/kern_prot.c b/bsd/kern/kern_prot.c index 1301dbeea..23c602e8b 100644 --- a/bsd/kern/kern_prot.c +++ b/bsd/kern/kern_prot.c @@ -720,42 +720,34 @@ setuid(proc_t p, struct setuid_args *uap, __unused int32_t *retval) kauth_cred_t my_cred, my_new_cred; posix_cred_t my_pcred; - uid = uap->uid; + /* get current credential and take a reference while we muck with it */ my_cred = kauth_cred_proc_ref(p); my_pcred = posix_cred_get(my_cred); DEBUG_CRED_ENTER("setuid (%d/%d): %p %d\n", p->p_pid, (p->p_pptr ? p->p_pptr->p_pid : 0), my_cred, uap->uid); AUDIT_ARG(uid, uid); - if (uid != my_pcred->cr_ruid && /* allow setuid(getuid()) */ - uid != my_pcred->cr_svuid && /* allow setuid(saved uid) */ - (error = suser(my_cred, &p->p_acflag))) { - kauth_cred_unref(&my_cred); - return (error); - } - /* - * Everything's okay, do it. 
- */ + for (;;) { + if (uid != my_pcred->cr_ruid && /* allow setuid(getuid()) */ + uid != my_pcred->cr_svuid && /* allow setuid(saved uid) */ + (error = suser(my_cred, &p->p_acflag))) { + kauth_cred_unref(&my_cred); + return (error); + } - /* - * If we are priviledged, then set the saved and real UID too; - * otherwise, just set the effective UID - */ - if (suser(my_cred, &p->p_acflag) == 0) { - svuid = uid; - ruid = uid; /* - * Transfer proc count to new user. - * chgproccnt uses list lock for protection + * If we are privileged, then set the saved and real UID too; + * otherwise, just set the effective UID */ - (void)chgproccnt(uid, 1); - (void)chgproccnt(my_pcred->cr_ruid, -1); - } - - /* get current credential and take a reference while we muck with it */ - for (;;) { + if (suser(my_cred, &p->p_acflag) == 0) { + svuid = uid; + ruid = uid; + } else { + svuid = KAUTH_UID_NONE; + ruid = KAUTH_UID_NONE; + } /* * Only set the gmuid if the current cred has not opt'ed out; * this normally only happens when calling setgroups() instead @@ -780,17 +772,39 @@ setuid(proc_t p, struct setuid_args *uap, __unused int32_t *retval) DEBUG_CRED_CHANGE("setuid CH(%d): %p/0x%08x -> %p/0x%08x\n", p->p_pid, my_cred, my_pcred->cr_flags, my_new_cred, posix_cred_get(my_new_cred)->cr_flags); + /* + * If we're changing the ruid from A to B, we might race with another thread that's setting ruid from B to A. + * The current locking mechanisms don't allow us to make the entire credential switch operation atomic, + * thus we may be able to change the process credentials from ruid A to B, but get preempted before incrementing the proc + * count of B. If a second thread sees the new process credentials and switches back to ruid A, that other thread + * may be able to decrement the proc count of B before we can increment it. This results in a panic. + * Incrementing the proc count of the target ruid, B, before setting the process credentials prevents this race. + */ + if (ruid != KAUTH_UID_NONE) { + (void)chgproccnt(ruid, 1); + } + proc_lock(p); /* * We need to protect for a race where another thread * also changed the credential after we took our * reference. If p_ucred has changed then we should * restart this again with the new cred. + * + * Note: the kauth_cred_setresuid has consumed a reference to my_cred, it p_ucred != my_cred, then my_cred must not be dereferenced! */ if (p->p_ucred != my_cred) { proc_unlock(p); + /* + * We didn't successfully switch to the new ruid, so decrement + * the procs/uid count that we incremented above. + */ + if (ruid != KAUTH_UID_NONE) { + (void)chgproccnt(ruid, -1); + } kauth_cred_unref(&my_new_cred); my_cred = kauth_cred_proc_ref(p); + my_pcred = posix_cred_get(my_cred); /* try again */ continue; } @@ -800,6 +814,13 @@ setuid(proc_t p, struct setuid_args *uap, __unused int32_t *retval) OSBitOrAtomic(P_SUGID, &p->p_flag); proc_unlock(p); + /* + * If we've updated the ruid, decrement the count of procs running + * under the previous ruid + */ + if (ruid != KAUTH_UID_NONE) { + (void)chgproccnt(my_pcred->cr_ruid, -1); + } } break; } @@ -845,18 +866,14 @@ seteuid(proc_t p, struct seteuid_args *uap, __unused int32_t *retval) my_cred = kauth_cred_proc_ref(p); my_pcred = posix_cred_get(my_cred); - if (euid != my_pcred->cr_ruid && euid != my_pcred->cr_svuid && - (error = suser(my_cred, &p->p_acflag))) { - kauth_cred_unref(&my_cred); - return (error); - } - - /* - * Everything's okay, do it. Copy credentials so other references do - * not see our changes. 
get current credential and take a reference - * while we muck with it - */ for (;;) { + + if (euid != my_pcred->cr_ruid && euid != my_pcred->cr_svuid && + (error = suser(my_cred, &p->p_acflag))) { + kauth_cred_unref(&my_cred); + return (error); + } + /* * Set the credential with new info. If there is no change, * we get back the same credential we passed in; if there is @@ -881,6 +898,7 @@ seteuid(proc_t p, struct seteuid_args *uap, __unused int32_t *retval) proc_unlock(p); kauth_cred_unref(&my_new_cred); my_cred = kauth_cred_proc_ref(p); + my_pcred = posix_cred_get(my_cred); /* try again */ continue; } @@ -953,32 +971,25 @@ setreuid(proc_t p, struct setreuid_args *uap, __unused int32_t *retval) my_cred = kauth_cred_proc_ref(p); my_pcred = posix_cred_get(my_cred); - if (((ruid != KAUTH_UID_NONE && /* allow no change of ruid */ - ruid != my_pcred->cr_ruid && /* allow ruid = ruid */ - ruid != my_pcred->cr_uid && /* allow ruid = euid */ - ruid != my_pcred->cr_svuid) || /* allow ruid = svuid */ - (euid != KAUTH_UID_NONE && /* allow no change of euid */ - euid != my_pcred->cr_uid && /* allow euid = euid */ - euid != my_pcred->cr_ruid && /* allow euid = ruid */ - euid != my_pcred->cr_svuid)) && /* allow euid = svui */ - (error = suser(my_cred, &p->p_acflag))) { /* allow root user any */ - kauth_cred_unref(&my_cred); - return (error); - } - - /* - * Everything's okay, do it. Copy credentials so other references do - * not see our changes. get current credential and take a reference - * while we muck with it - */ for (;;) { + + if (((ruid != KAUTH_UID_NONE && /* allow no change of ruid */ + ruid != my_pcred->cr_ruid && /* allow ruid = ruid */ + ruid != my_pcred->cr_uid && /* allow ruid = euid */ + ruid != my_pcred->cr_svuid) || /* allow ruid = svuid */ + (euid != KAUTH_UID_NONE && /* allow no change of euid */ + euid != my_pcred->cr_uid && /* allow euid = euid */ + euid != my_pcred->cr_ruid && /* allow euid = ruid */ + euid != my_pcred->cr_svuid)) && /* allow euid = svuid */ + (error = suser(my_cred, &p->p_acflag))) { /* allow root user any */ + kauth_cred_unref(&my_cred); + return (error); + } + uid_t new_euid; - uid_t new_ruid; uid_t svuid = KAUTH_UID_NONE; new_euid = my_pcred->cr_uid; - new_ruid = my_pcred->cr_ruid; - /* * Set the credential with new info. If there is no change, * we get back the same credential we passed in; if there is @@ -986,19 +997,11 @@ setreuid(proc_t p, struct setreuid_args *uap, __unused int32_t *retval) * passed in. The subsequent compare is safe, because it is * a pointer compare rather than a contents compare. 
*/ - if (euid == KAUTH_UID_NONE && my_pcred->cr_uid != euid) { + if (euid != KAUTH_UID_NONE && my_pcred->cr_uid != euid) { /* changing the effective UID */ new_euid = euid; OSBitOrAtomic(P_SUGID, &p->p_flag); } - if (ruid != KAUTH_UID_NONE && my_pcred->cr_ruid != ruid) { - /* changing the real UID; must do user accounting */ - /* chgproccnt uses list lock for protection */ - (void)chgproccnt(ruid, 1); - (void)chgproccnt(my_pcred->cr_ruid, -1); - new_ruid = ruid; - OSBitOrAtomic(P_SUGID, &p->p_flag); - } /* * If the newly requested real uid or effective uid does * not match the saved uid, then set the saved uid to the @@ -1017,25 +1020,56 @@ setreuid(proc_t p, struct setreuid_args *uap, __unused int32_t *retval) DEBUG_CRED_CHANGE("setreuid CH(%d): %p/0x%08x -> %p/0x%08x\n", p->p_pid, my_cred, my_pcred->cr_flags, my_new_cred, posix_cred_get(my_new_cred)->cr_flags); + /* + * If we're changing the ruid from A to B, we might race with another thread that's setting ruid from B to A. + * The current locking mechanisms don't allow us to make the entire credential switch operation atomic, + * thus we may be able to change the process credentials from ruid A to B, but get preempted before incrementing the proc + * count of B. If a second thread sees the new process credentials and switches back to ruid A, that other thread + * may be able to decrement the proc count of B before we can increment it. This results in a panic. + * Incrementing the proc count of the target ruid, B, before setting the process credentials prevents this race. + */ + if (ruid != KAUTH_UID_NONE) { + (void)chgproccnt(ruid, 1); + } + proc_lock(p); /* * We need to protect for a race where another thread * also changed the credential after we took our * reference. If p_ucred has changed then we should * restart this again with the new cred. + * + * Note: the kauth_cred_setresuid has consumed a reference to my_cred, it p_ucred != my_cred, then my_cred must not be dereferenced! */ if (p->p_ucred != my_cred) { proc_unlock(p); + if (ruid != KAUTH_UID_NONE) { + /* + * We didn't successfully switch to the new ruid, so decrement + * the procs/uid count that we incremented above. + */ + (void)chgproccnt(ruid, -1); + } kauth_cred_unref(&my_new_cred); my_cred = kauth_cred_proc_ref(p); + my_pcred = posix_cred_get(my_cred); /* try again */ continue; } + p->p_ucred = my_new_cred; /* update cred on proc */ PROC_UPDATE_CREDS_ONPROC(p); - OSBitOrAtomic(P_SUGID, &p->p_flag); /* XXX redundant? 
*/ + OSBitOrAtomic(P_SUGID, &p->p_flag); proc_unlock(p); + + if (ruid != KAUTH_UID_NONE) { + /* + * We switched to a new ruid, so decrement the count of procs running + * under the previous ruid + */ + (void)chgproccnt(my_pcred->cr_ruid, -1); + } } break; } @@ -1087,28 +1121,30 @@ setgid(proc_t p, struct setgid_args *uap, __unused int32_t *retval) gid = uap->gid; AUDIT_ARG(gid, gid); + /* get current credential and take a reference while we muck with it */ my_cred = kauth_cred_proc_ref(p); my_pcred = posix_cred_get(my_cred); - if (gid != my_pcred->cr_rgid && /* allow setgid(getgid()) */ - gid != my_pcred->cr_svgid && /* allow setgid(saved gid) */ - (error = suser(my_cred, &p->p_acflag))) { - kauth_cred_unref(&my_cred); - return (error); - } + for (;;) { + if (gid != my_pcred->cr_rgid && /* allow setgid(getgid()) */ + gid != my_pcred->cr_svgid && /* allow setgid(saved gid) */ + (error = suser(my_cred, &p->p_acflag))) { + kauth_cred_unref(&my_cred); + return (error); + } - /* - * If we are priviledged, then set the saved and real GID too; - * otherwise, just set the effective GID - */ - if (suser(my_cred, &p->p_acflag) == 0) { - svgid = gid; - rgid = gid; - } + /* + * If we are privileged, then set the saved and real GID too; + * otherwise, just set the effective GID + */ + if (suser(my_cred, &p->p_acflag) == 0) { + svgid = gid; + rgid = gid; + } else { + svgid = KAUTH_GID_NONE; + rgid = KAUTH_GID_NONE; + } - /* get current credential and take a reference while we muck with it */ - for (;;) { - /* * Set the credential with new info. If there is no change, * we get back the same credential we passed in; if there is @@ -1133,6 +1169,7 @@ setgid(proc_t p, struct setgid_args *uap, __unused int32_t *retval) kauth_cred_unref(&my_new_cred); /* try again */ my_cred = kauth_cred_proc_ref(p); + my_pcred = posix_cred_get(my_cred); continue; } p->p_ucred = my_new_cred; @@ -1187,18 +1224,18 @@ setegid(proc_t p, struct setegid_args *uap, __unused int32_t *retval) egid = uap->egid; AUDIT_ARG(egid, egid); + /* get current credential and take a reference while we muck with it */ my_cred = kauth_cred_proc_ref(p); my_pcred = posix_cred_get(my_cred); - if (egid != my_pcred->cr_rgid && - egid != my_pcred->cr_svgid && - (error = suser(my_cred, &p->p_acflag))) { - kauth_cred_unref(&my_cred); - return (error); - } - /* get current credential and take a reference while we muck with it */ for (;;) { + if (egid != my_pcred->cr_rgid && + egid != my_pcred->cr_svgid && + (error = suser(my_cred, &p->p_acflag))) { + kauth_cred_unref(&my_cred); + return (error); + } /* * Set the credential with new info. 
If there is no change, * we get back the same credential we passed in; if there is @@ -1223,6 +1260,7 @@ setegid(proc_t p, struct setegid_args *uap, __unused int32_t *retval) kauth_cred_unref(&my_new_cred); /* try again */ my_cred = kauth_cred_proc_ref(p); + my_pcred = posix_cred_get(my_cred); continue; } p->p_ucred = my_new_cred; @@ -1298,25 +1336,26 @@ setregid(proc_t p, struct setregid_args *uap, __unused int32_t *retval) AUDIT_ARG(egid, egid); AUDIT_ARG(rgid, rgid); + /* get current credential and take a reference while we muck with it */ my_cred = kauth_cred_proc_ref(p); my_pcred = posix_cred_get(my_cred); - if (((rgid != KAUTH_UID_NONE && /* allow no change of rgid */ - rgid != my_pcred->cr_rgid && /* allow rgid = rgid */ - rgid != my_pcred->cr_gid && /* allow rgid = egid */ - rgid != my_pcred->cr_svgid) || /* allow rgid = svgid */ - (egid != KAUTH_UID_NONE && /* allow no change of egid */ - egid != my_pcred->cr_groups[0] && /* allow no change of egid */ - egid != my_pcred->cr_gid && /* allow egid = egid */ - egid != my_pcred->cr_rgid && /* allow egid = rgid */ - egid != my_pcred->cr_svgid)) && /* allow egid = svgid */ - (error = suser(my_cred, &p->p_acflag))) { /* allow root user any */ - kauth_cred_unref(&my_cred); - return (error); - } - - /* get current credential and take a reference while we muck with it */ for (;;) { + + if (((rgid != KAUTH_UID_NONE && /* allow no change of rgid */ + rgid != my_pcred->cr_rgid && /* allow rgid = rgid */ + rgid != my_pcred->cr_gid && /* allow rgid = egid */ + rgid != my_pcred->cr_svgid) || /* allow rgid = svgid */ + (egid != KAUTH_UID_NONE && /* allow no change of egid */ + egid != my_pcred->cr_groups[0] && /* allow no change of egid */ + egid != my_pcred->cr_gid && /* allow egid = egid */ + egid != my_pcred->cr_rgid && /* allow egid = rgid */ + egid != my_pcred->cr_svgid)) && /* allow egid = svgid */ + (error = suser(my_cred, &p->p_acflag))) { /* allow root user any */ + kauth_cred_unref(&my_cred); + return (error); + } + uid_t new_egid = my_pcred->cr_gid; uid_t new_rgid = my_pcred->cr_rgid; uid_t svgid = KAUTH_UID_NONE; @@ -1329,7 +1368,7 @@ setregid(proc_t p, struct setregid_args *uap, __unused int32_t *retval) * passed in. The subsequent compare is safe, because it is * a pointer compare rather than a contents compare. */ - if (egid == KAUTH_UID_NONE && my_pcred->cr_gid != egid) { + if (egid != KAUTH_UID_NONE && my_pcred->cr_gid != egid) { /* changing the effective GID */ new_egid = egid; OSBitOrAtomic(P_SUGID, &p->p_flag); @@ -1367,6 +1406,7 @@ setregid(proc_t p, struct setregid_args *uap, __unused int32_t *retval) kauth_cred_unref(&my_new_cred); /* try again */ my_cred = kauth_cred_proc_ref(p); + my_pcred = posix_cred_get(my_cred); continue; } p->p_ucred = my_new_cred; @@ -1387,7 +1427,7 @@ setregid(proc_t p, struct setregid_args *uap, __unused int32_t *retval) /* * Set the per-thread override identity. The first parameter can be the - * current real UID, KAUTH_UID_NONE, or, if the caller is priviledged, it + * current real UID, KAUTH_UID_NONE, or, if the caller is privileged, it * can be any UID. 
If it is KAUTH_UID_NONE, then as a special case, this * means "revert to the per process credential"; otherwise, if permitted, * it changes the effective, real, and saved UIDs and GIDs for the current diff --git a/bsd/kern/kern_sysctl.c b/bsd/kern/kern_sysctl.c index 251629b5b..c7978d82e 100644 --- a/bsd/kern/kern_sysctl.c +++ b/bsd/kern/kern_sysctl.c @@ -180,6 +180,11 @@ extern unsigned int vm_page_free_reserved; extern unsigned int vm_page_speculative_percentage; extern unsigned int vm_page_speculative_q_age_ms; +#if (DEVELOPMENT || DEBUG) +extern uint32_t vm_page_creation_throttled_hard; +extern uint32_t vm_page_creation_throttled_soft; +#endif /* DEVELOPMENT || DEBUG */ + /* * Conditionally allow dtrace to see these functions for debugging purposes. */ @@ -2660,6 +2665,7 @@ SYSCTL_INT(_vm, OID_AUTO, vm_page_external_count, CTLFLAG_RD | CTLFLAG_LOCKED, & SYSCTL_INT(_vm, OID_AUTO, vm_page_filecache_min, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_page_filecache_min, 0, ""); extern int vm_compressor_mode; +extern int vm_compressor_is_active; extern uint32_t swapout_target_age; extern int64_t compressor_bytes_used; extern uint32_t compressor_eval_period_in_msecs; @@ -2673,6 +2679,7 @@ extern uint32_t vm_compressor_unthrottle_threshold_divisor; extern uint32_t vm_compressor_catchup_threshold_divisor; SYSCTL_INT(_vm, OID_AUTO, compressor_mode, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_compressor_mode, 0, ""); +SYSCTL_INT(_vm, OID_AUTO, compressor_is_active, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_compressor_is_active, 0, ""); SYSCTL_QUAD(_vm, OID_AUTO, compressor_bytes_used, CTLFLAG_RD | CTLFLAG_LOCKED, &compressor_bytes_used, ""); SYSCTL_INT(_vm, OID_AUTO, compressor_swapout_target_age, CTLFLAG_RD | CTLFLAG_LOCKED, &swapout_target_age, 0, ""); @@ -2699,6 +2706,18 @@ SYSCTL_INT(_vm, OID_AUTO, phantom_cache_thrashing_threshold, CTLFLAG_RW | CTLFLA SYSCTL_INT(_vm, OID_AUTO, phantom_cache_thrashing_threshold_ssd, CTLFLAG_RW | CTLFLAG_LOCKED, &phantom_cache_thrashing_threshold_ssd, 0, ""); #endif +#if (DEVELOPMENT || DEBUG) + +SYSCTL_UINT(_vm, OID_AUTO, vm_page_creation_throttled_hard, + CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, + &vm_page_creation_throttled_hard, 0, ""); + +SYSCTL_UINT(_vm, OID_AUTO, vm_page_creation_throttled_soft, + CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, + &vm_page_creation_throttled_soft, 0, ""); + +#endif /* DEVELOPMENT || DEBUG */ + /* * Enable tracing of voucher contents */ diff --git a/bsd/kern/mach_loader.c b/bsd/kern/mach_loader.c index c590c52d6..0f477bdf0 100644 --- a/bsd/kern/mach_loader.c +++ b/bsd/kern/mach_loader.c @@ -113,7 +113,8 @@ static load_result_t load_result_null = { .csflags = 0, .uuid = { 0 }, .min_vm_addr = MACH_VM_MAX_ADDRESS, - .max_vm_addr = MACH_VM_MIN_ADDRESS + .max_vm_addr = MACH_VM_MIN_ADDRESS, + .cs_end_offset = 0 }; /* @@ -772,6 +773,37 @@ parse_machfile( } else { got_code_signatures = TRUE; } + + if (got_code_signatures) { + boolean_t valid = FALSE, tainted = TRUE; + struct cs_blob *blobs; + vm_size_t off = 0; + + + if (cs_debug > 10) + printf("validating initial pages of %s\n", vp->v_name); + blobs = ubc_get_cs_blobs(vp); + + while (off < size && ret == LOAD_SUCCESS) { + valid = cs_validate_page(blobs, + NULL, + file_offset + off, + addr + off, + &tainted); + if (!valid || tainted) { + if (cs_debug) + printf("CODE SIGNING: %s[%d]: invalid initial page at offset %lld validated:%d tainted:%d csflags:0x%x\n", + vp->v_name, p->p_pid, (long long)(file_offset + off), valid, tainted, result->csflags); + if (cs_enforcement(NULL) || + (result->csflags & 
(CS_HARD|CS_KILL|CS_ENFORCEMENT))) { + ret = LOAD_FAILURE; + } + result->csflags &= ~CS_VALID; + } + off += PAGE_SIZE; + } + } + break; #if CONFIG_CODE_DECRYPTION case LC_ENCRYPTION_INFO: @@ -991,6 +1023,20 @@ load_segment( if ((scp->fileoff & PAGE_MASK_64) != 0) return (LOAD_BADMACHO); + /* + * If we have a code signature attached for this slice + * require that the segments are within the signed part + * of the file. + */ + if (result->cs_end_offset && + result->cs_end_offset < (off_t)scp->fileoff && + result->cs_end_offset - scp->fileoff < scp->filesize) + { + if (cs_debug) + printf("section outside code signature\n"); + return LOAD_BADMACHO; + } + /* * Round sizes to page size. */ @@ -1290,25 +1336,46 @@ load_threadstate( uint32_t size; int flavor; uint32_t thread_size; + uint32_t *local_ts; + uint32_t local_ts_size; - ret = thread_state_initialize( thread ); - if (ret != KERN_SUCCESS) { - return(LOAD_FAILURE); - } + local_ts = NULL; + local_ts_size = 0; + + ret = thread_state_initialize( thread ); + if (ret != KERN_SUCCESS) { + ret = LOAD_FAILURE; + goto done; + } + if (total_size > 0) { + local_ts_size = total_size; + local_ts = kalloc(local_ts_size); + if (local_ts == NULL) { + ret = LOAD_FAILURE; + goto done; + } + memcpy(local_ts, ts, local_ts_size); + ts = local_ts; + } + /* - * Set the new thread state; iterate through the state flavors in - * the mach-o file. + * Set the new thread state; iterate through the state flavors in + * the mach-o file. */ while (total_size > 0) { flavor = *ts++; size = *ts++; if (UINT32_MAX-2 < size || - UINT32_MAX/sizeof(uint32_t) < size+2) - return (LOAD_BADMACHO); + UINT32_MAX/sizeof(uint32_t) < size+2) { + ret = LOAD_BADMACHO; + goto done; + } thread_size = (size+2)*sizeof(uint32_t); - if (thread_size > total_size) - return(LOAD_BADMACHO); + if (thread_size > total_size) { + ret = LOAD_BADMACHO; + goto done; + } total_size -= thread_size; /* * Third argument is a kernel space pointer; it gets cast @@ -1317,11 +1384,19 @@ load_threadstate( */ ret = thread_setstatus(thread, flavor, (thread_state_t)ts, size); if (ret != KERN_SUCCESS) { - return(LOAD_FAILURE); + ret = LOAD_FAILURE; + goto done; } ts += size; /* ts is a (uint32_t *) */ } - return(LOAD_SUCCESS); + ret = LOAD_SUCCESS; + +done: + if (local_ts != NULL) { + kfree(local_ts, local_ts_size); + local_ts = NULL; + } + return ret; } static @@ -1584,7 +1659,7 @@ load_code_signature( goto out; } - blob = ubc_cs_blob_get(vp, cputype, -1); + blob = ubc_cs_blob_get(vp, cputype, macho_offset); if (blob != NULL) { /* we already have a blob for this vnode and cputype */ if (blob->csb_cpu_type == cputype && @@ -1644,13 +1719,14 @@ load_code_signature( ubc_cs_validation_bitmap_allocate( vp ); #endif - blob = ubc_cs_blob_get(vp, cputype, -1); + blob = ubc_cs_blob_get(vp, cputype, macho_offset); ret = LOAD_SUCCESS; out: if (ret == LOAD_SUCCESS) { result->csflags |= blob->csb_flags; result->platform_binary = blob->csb_platform_binary; + result->cs_end_offset = blob->csb_end_offset; } if (addr != 0) { ubc_cs_blob_deallocate(addr, blob_size); diff --git a/bsd/kern/mach_loader.h b/bsd/kern/mach_loader.h index d1c83d1f9..dc0dbfa5b 100644 --- a/bsd/kern/mach_loader.h +++ b/bsd/kern/mach_loader.h @@ -70,6 +70,7 @@ typedef struct _load_result { mach_vm_address_t min_vm_addr; mach_vm_address_t max_vm_addr; unsigned int platform_binary; + off_t cs_end_offset; } load_result_t; struct image_params; diff --git a/bsd/kern/makekdebugevents.py b/bsd/kern/makekdebugevents.py new file mode 100755 index 000000000..73b2db49e 
--- /dev/null +++ b/bsd/kern/makekdebugevents.py @@ -0,0 +1,38 @@ +#!/usr/bin/python +# +# This script scans the trace.codes file, containing a mapping of event id to +# event name for all events, and writes to stdout a C declaration for a table +# named kd_events[] or these mappings. +# Required to generate a header file used by DEVELOPMENT and DEBUG kernels. +# + +import sys +import re + +# we expect one arg specifying the path to the trace.codes file +if (len(sys.argv) < 2): + exit(1) +trace_code_file = sys.argv[1] + +# regular expression pattern to match +id_name_pattern = re.compile('0x([0-9a-fA-F]+)\s+([^\s]*)') +code_table = [] + +# scan file to generate internal table +with open(trace_code_file, 'rt') as codes: + for line in codes: + m = id_name_pattern.match(line) + if m: + code_table += [(int(m.group(1),base=16), m.group(2))] + +# emit typedef: +print "typedef struct {" +print " uint32_t id;" +print " const char *name;" +print "} kd_event_t;" +# emit structure declaration and sorted initialization: +print "kd_event_t kd_events[] = {" +for mapping in sorted(code_table, key=lambda x: x[0]): + print " {0x%x, \"%s\"}," % mapping +print "};" + diff --git a/bsd/kern/proc_info.c b/bsd/kern/proc_info.c index 2af5cc29e..a2b82a6e4 100644 --- a/bsd/kern/proc_info.c +++ b/bsd/kern/proc_info.c @@ -1663,6 +1663,7 @@ fill_vnodeinfo(vnode_t vp, struct vnode_info *vinfo) struct stat64 sb; int error = 0; + bzero(&sb, sizeof(struct stat64)); context = vfs_context_create((vfs_context_t)0); error = vn_stat(vp, &sb, NULL, 1, context); (void)vfs_context_rele(context); diff --git a/bsd/kern/sys_generic.c b/bsd/kern/sys_generic.c index c839e868f..1247ff355 100644 --- a/bsd/kern/sys_generic.c +++ b/bsd/kern/sys_generic.c @@ -1487,7 +1487,7 @@ poll_nocancel(struct proc *p, struct poll_nocancel_args *uap, int32_t *retval) /* Handle input events */ if (events & ( POLLIN | POLLRDNORM | POLLPRI | POLLRDBAND | POLLHUP )) { kev.filter = EVFILT_READ; - if (!(events & ( POLLIN | POLLRDNORM ))) + if (events & ( POLLPRI | POLLRDBAND )) kev.flags |= EV_OOBAND; kerror = kevent_register(kq, &kev, p); } @@ -1559,7 +1559,7 @@ poll_callback(__unused struct kqueue *kq, struct kevent64_s *kevp, void *data) struct poll_continue_args *cont = (struct poll_continue_args *)data; struct pollfd *fds = CAST_DOWN(struct pollfd *, kevp->udata); short prev_revents = fds->revents; - short mask; + short mask = 0; /* convert the results back into revents */ if (kevp->flags & EV_EOF) @@ -1572,7 +1572,8 @@ poll_callback(__unused struct kqueue *kq, struct kevent64_s *kevp, void *data) if (fds->revents & POLLHUP) mask = (POLLIN | POLLRDNORM | POLLPRI | POLLRDBAND ); else { - mask = (POLLIN | POLLRDNORM ); + if ((kevp->flags & EV_ERROR) == 0 && kevp->data != 0) + mask = (POLLIN | POLLRDNORM ); if (kevp->flags & EV_OOBAND) mask |= ( POLLPRI | POLLRDBAND ); } diff --git a/bsd/kern/trace.codes b/bsd/kern/trace.codes index e08f59ff5..57de6588d 100644 --- a/bsd/kern/trace.codes +++ b/bsd/kern/trace.codes @@ -321,6 +321,7 @@ 0x1a2001c SFI_WAIT_CANCELED 0x1a20020 SFI_PID_SET_MANAGED 0x1a20024 SFI_PID_CLEAR_MANAGED +0x1a20028 SFI_GLOBAL_DEFER 0x1a30004 ENERGY_PERF_GPU_DESCRIPTION 0x1a30008 ENERGY_PERF_GPU_TIME 0x2010000 L_IP_In_Beg @@ -1550,6 +1551,7 @@ 0x5310278 CPUPM_PST_UIB 0x531027C CPUPM_PST_PLIMIT_UIB 0x5310280 CPUPM_IO +0x5310284 CPUPM_FI 0x5330000 HIBERNATE 0x5330004 HIBERNATE_WRITE_IMAGE 0x5330008 HIBERNATE_MACHINE_INIT diff --git a/bsd/kern/uipc_socket.c b/bsd/kern/uipc_socket.c index 6b57b3cf8..2dc33d759 100644 --- a/bsd/kern/uipc_socket.c 
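To make the generated table concrete: run over a trace.codes fragment containing just the two entries added above (SFI_GLOBAL_DEFER and CPUPM_FI), makekdebugevents.py would emit roughly the following C, which is the sorted kd_events[] array that binary_search() and event_lookup_cache() in kdebug.c walk:

typedef struct {
	uint32_t id;
	const char *name;
} kd_event_t;
kd_event_t kd_events[] = {
	{0x1a20028, "SFI_GLOBAL_DEFER"},
	{0x5310284, "CPUPM_FI"},
};
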
+++ b/bsd/kern/uipc_socket.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2014 Apple Inc. All rights reserved. + * Copyright (c) 1998-2015 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -5392,6 +5392,17 @@ soo_kqfilter(struct fileproc *fp, struct knote *kn, vfs_context_t ctx) switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &soread_filtops; + /* + * If the caller explicitly asked for OOB results (e.g. poll()), + * save that off in the hookid field and reserve the kn_flags + * EV_OOBAND bit for output only). + */ + if (kn->kn_flags & EV_OOBAND) { + kn->kn_flags &= ~EV_OOBAND; + kn->kn_hookid = EV_OOBAND; + } else { + kn->kn_hookid = 0; + } skl = &so->so_rcv.sb_sel.si_note; break; case EVFILT_WRITE: @@ -5467,44 +5478,42 @@ filt_soread(struct knote *kn, long hint) } /* socket isn't a listener */ - kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl; + /* + * Clear out EV_OOBAND that filt_soread may have set in the + * past. + */ + kn->kn_flags &= ~EV_OOBAND; - if (so->so_oobmark) { - if (kn->kn_flags & EV_OOBAND) { + if ((so->so_oobmark) || (so->so_state & SS_RCVATMARK)){ + kn->kn_flags |= EV_OOBAND; + /* + * If caller registered explicit interest in OOB data, + * return immediately (data == amount beyond mark, for + * legacy reasons - that should be changed later). + */ + if (kn->kn_hookid == EV_OOBAND) { + /* + * When so_state is SS_RCVATMARK, so_oobmark + * is 0. + */ kn->kn_data -= so->so_oobmark; if ((hint & SO_FILT_HINT_LOCKED) == 0) socket_unlock(so, 1); return (1); } - kn->kn_data = so->so_oobmark; - kn->kn_flags |= EV_OOBAND; - } else { - if ((so->so_state & SS_CANTRCVMORE) + } + + if ((so->so_state & SS_CANTRCVMORE) #if CONTENT_FILTER - && cfil_sock_data_pending(&so->so_rcv) == 0 + && cfil_sock_data_pending(&so->so_rcv) == 0 #endif /* CONTENT_FILTER */ - ) { - kn->kn_flags |= EV_EOF; - kn->kn_fflags = so->so_error; - if ((hint & SO_FILT_HINT_LOCKED) == 0) - socket_unlock(so, 1); - return (1); - } - } - - if (so->so_state & SS_RCVATMARK) { - if (kn->kn_flags & EV_OOBAND) { - if ((hint & SO_FILT_HINT_LOCKED) == 0) - socket_unlock(so, 1); - return (1); - } - kn->kn_flags |= EV_OOBAND; - } else if (kn->kn_flags & EV_OOBAND) { - kn->kn_data = 0; + ) { + kn->kn_flags |= EV_EOF; + kn->kn_fflags = so->so_error; if ((hint & SO_FILT_HINT_LOCKED) == 0) socket_unlock(so, 1); - return (0); + return (1); } if (so->so_error) { /* temporary udp error */ @@ -5524,7 +5533,7 @@ filt_soread(struct knote *kn, long hint) if ((hint & SO_FILT_HINT_LOCKED) == 0) socket_unlock(so, 1); - return ((kn->kn_flags & EV_OOBAND) || kn->kn_data >= lowwat); + return (kn->kn_data >= lowwat); } static void diff --git a/bsd/man/man2/kqueue.2 b/bsd/man/man2/kqueue.2 index 201ad5de3..c3e668072 100644 --- a/bsd/man/man2/kqueue.2 +++ b/bsd/man/man2/kqueue.2 @@ -1,5 +1,5 @@ .\" -.\" Copyright (c) 2008 Apple Inc. All rights reserved. +.\" Copyright (c) 2008-2015 Apple Inc. All rights reserved. .\" .\" @APPLE_LICENSE_HEADER_START@ .\" @@ -281,6 +281,9 @@ instead of the current state. Note that some filters may automatically set this flag internally. .It EV_EOF Filters may set this flag to indicate filter-specific EOF condition. +.It EV_OOBAND +Read filter on socket may set this flag to indicate the presence of out of +band data on the descriptor. .It EV_ERROR See .Sx RETURN VALUES @@ -329,6 +332,12 @@ On return, .Va data contains the number of bytes of protocol data available to read. 
.Pp +The presence of EV_OOBAND in +.Va flags +indicates the presence of out of band data on the socket, with +.Va data +equal to the potential number of OOB bytes available to read. +.Pp If the read direction of the socket has shutdown, then the filter also sets EV_EOF in .Va flags , diff --git a/bsd/miscfs/specfs/spec_vnops.c b/bsd/miscfs/specfs/spec_vnops.c index 0fd816243..fe762c21f 100644 --- a/bsd/miscfs/specfs/spec_vnops.c +++ b/bsd/miscfs/specfs/spec_vnops.c @@ -2401,10 +2401,9 @@ filt_specdetach(struct knote *kn) if (ret != KERN_SUCCESS) { panic("filt_specdetach(): failed to unlink wait queue link."); } - + knote_clearstayqueued(kn); (void)wait_queue_link_free(kn->kn_hook); kn->kn_hook = NULL; - kn->kn_status &= ~KN_STAYQUEUED; } static int diff --git a/bsd/net/if_bridge.c b/bsd/net/if_bridge.c index 37bea9581..98fff2803 100644 --- a/bsd/net/if_bridge.c +++ b/bsd/net/if_bridge.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2014 Apple Inc. All rights reserved. + * Copyright (c) 2004-2015 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -2087,6 +2087,14 @@ bridge_delete_member(struct bridge_softc *sc, struct bridge_iflist *bif, BRIDGE_LOCK_ASSERT_HELD(sc); VERIFY(ifs != NULL); + /* + * Remove the member from the list first so it cannot be found anymore + * when we release the bridge lock below + */ + BRIDGE_XLOCK(sc); + TAILQ_REMOVE(&sc->sc_iflist, bif, bif_next); + BRIDGE_XDROP(sc); + if (!gone) { switch (ifs->if_type) { case IFT_ETHER: @@ -2094,8 +2102,15 @@ bridge_delete_member(struct bridge_softc *sc, struct bridge_iflist *bif, /* * Take the interface out of promiscuous mode. */ - if (bif->bif_flags & BIFF_PROMISC) + if (bif->bif_flags & BIFF_PROMISC) { + /* + * Unlock to prevent deadlock with bridge_iff_event() in + * case the driver generates an interface event + */ + BRIDGE_UNLOCK(sc); (void) ifnet_set_promiscuous(ifs, 0); + BRIDGE_LOCK(sc); + } break; case IFT_GIF: @@ -2123,10 +2138,6 @@ bridge_delete_member(struct bridge_softc *sc, struct bridge_iflist *bif, bstp_disable(&bif->bif_stp); #endif /* BRIDGESTP */ - BRIDGE_XLOCK(sc); - TAILQ_REMOVE(&sc->sc_iflist, bif, bif_next); - BRIDGE_XDROP(sc); - /* * If removing the interface that gave the bridge its mac address, set * the mac address of the bridge to the address of the next member, or diff --git a/bsd/netinet/in_systm.h b/bsd/netinet/in_systm.h index 3ea2612a6..e0972f831 100644 --- a/bsd/netinet/in_systm.h +++ b/bsd/netinet/in_systm.h @@ -85,6 +85,6 @@ typedef __uint32_t n_long; /* long as received from the net */ typedef __uint32_t n_time; /* ms since 00:00 GMT, byte rev */ #ifdef BSD_KERNEL_PRIVATE -n_time iptime(void); +u_int32_t iptime(void); #endif /* BSD_KERNEL_PRIVATE */ #endif /* _NETINET_IN_SYSTM_H_ */ diff --git a/bsd/netinet/ip_icmp.c b/bsd/netinet/ip_icmp.c index ba2869a50..256d54b8f 100644 --- a/bsd/netinet/ip_icmp.c +++ b/bsd/netinet/ip_icmp.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2013 Apple Inc. All rights reserved. + * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -124,24 +124,31 @@ */ struct icmpstat icmpstat; -SYSCTL_STRUCT(_net_inet_icmp, ICMPCTL_STATS, stats, CTLFLAG_RD | CTLFLAG_LOCKED, - &icmpstat, icmpstat, ""); +SYSCTL_STRUCT(_net_inet_icmp, ICMPCTL_STATS, stats, + CTLFLAG_RD | CTLFLAG_LOCKED, + &icmpstat, icmpstat, ""); static int icmpmaskrepl = 0; -SYSCTL_INT(_net_inet_icmp, ICMPCTL_MASKREPL, maskrepl, CTLFLAG_RW | CTLFLAG_LOCKED, - &icmpmaskrepl, 0, ""); +SYSCTL_INT(_net_inet_icmp, ICMPCTL_MASKREPL, maskrepl, + CTLFLAG_RW | CTLFLAG_LOCKED, + &icmpmaskrepl, 0, ""); static int icmptimestamp = 0; -SYSCTL_INT(_net_inet_icmp, ICMPCTL_TIMESTAMP, timestamp, CTLFLAG_RW | CTLFLAG_LOCKED, - &icmptimestamp, 0, ""); +SYSCTL_INT(_net_inet_icmp, ICMPCTL_TIMESTAMP, timestamp, + CTLFLAG_RW | CTLFLAG_LOCKED, + &icmptimestamp, 0, ""); -static int drop_redirect = 0; -SYSCTL_INT(_net_inet_icmp, OID_AUTO, drop_redirect, CTLFLAG_RW | CTLFLAG_LOCKED, - &drop_redirect, 0, ""); +static int drop_redirect = 1; +SYSCTL_INT(_net_inet_icmp, OID_AUTO, drop_redirect, + CTLFLAG_RW | CTLFLAG_LOCKED, + &drop_redirect, 0, ""); static int log_redirect = 0; -SYSCTL_INT(_net_inet_icmp, OID_AUTO, log_redirect, CTLFLAG_RW | CTLFLAG_LOCKED, - &log_redirect, 0, ""); +SYSCTL_INT(_net_inet_icmp, OID_AUTO, log_redirect, + CTLFLAG_RW | CTLFLAG_LOCKED, + &log_redirect, 0, ""); + +static int icmp_datalen = 8; #if ICMP_BANDLIM @@ -192,19 +199,19 @@ icmp_error( struct mbuf *n, int type, int code, - n_long dest, + u_int32_t dest, u_int32_t nextmtu) { - struct ip *oip = mtod(n, struct ip *), *nip; - unsigned oiplen; + struct ip *oip, *nip; struct icmp *icp; struct mbuf *m; - unsigned icmplen; + u_int32_t oiphlen, icmplen, icmpelen, nlen; /* Expect 32-bit aligned data pointer on strict-align platforms */ MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(n); - oiplen = IP_VHL_HL(oip->ip_vhl) << 2; + oip = mtod(n, struct ip *); + oiphlen = IP_VHL_HL(oip->ip_vhl) << 2; #if ICMPPRINTFS if (icmpprintfs) @@ -218,44 +225,92 @@ icmp_error( * Don't error if the old packet protocol was ICMP * error message, only known informational types. */ - if (oip->ip_off &~ (IP_MF|IP_DF)) + if (oip->ip_off & ~(IP_MF|IP_DF)) goto freeit; + if (oip->ip_p == IPPROTO_ICMP && type != ICMP_REDIRECT && - n->m_len >= oiplen + ICMP_MINLEN && - !ICMP_INFOTYPE(((struct icmp *)(void *)((caddr_t)oip + oiplen))-> + n->m_len >= oiphlen + ICMP_MINLEN && + !ICMP_INFOTYPE(((struct icmp *)(void *)((caddr_t)oip + oiphlen))-> icmp_type)) { icmpstat.icps_oldicmp++; goto freeit; } - /* Don't send error in response to a multicast or broadcast packet */ + /* + * Don't send error in response to a multicast or + * broadcast packet + */ if (n->m_flags & (M_BCAST|M_MCAST)) goto freeit; + + /* + * Calculate the length to quote from original packet and prevent + * the ICMP mbuf from overflowing. 
+ */ + nlen = m_length(n); + if (oip->ip_p == IPPROTO_TCP) { + struct tcphdr *th; + u_int16_t tcphlen; + + if (oiphlen + sizeof(struct tcphdr) > n->m_len && + n->m_next == NULL) + goto stdreply; + if (n->m_len < (oiphlen + sizeof(struct tcphdr)) && + (n = m_pullup(n, (oiphlen + sizeof(struct tcphdr)))) == NULL) + goto freeit; + + th = (struct tcphdr *)(void *)((caddr_t)oip + oiphlen); + if (th != ((struct tcphdr *)P2ROUNDDOWN(th, + sizeof(u_int32_t)))) + goto freeit; + tcphlen = th->th_off << 2; + if (tcphlen < sizeof(struct tcphdr)) + goto freeit; + if (oip->ip_len < (oiphlen + tcphlen)) + goto freeit; + if ((oiphlen + tcphlen) > n->m_len && n->m_next == NULL) + goto stdreply; + if (n->m_len < (oiphlen + tcphlen) && + (n = m_pullup(n, (oiphlen + tcphlen))) == NULL) + goto freeit; + + icmpelen = max(tcphlen, min(icmp_datalen, + (oip->ip_len - oiphlen))); + } else +stdreply: icmpelen = max(ICMP_MINLEN, min(icmp_datalen, + (ntohs(oip->ip_len) - oiphlen))); + + icmplen = min(oiphlen + icmpelen, min(nlen, oip->ip_len)); + if (icmplen < sizeof(struct ip)) + goto freeit; /* * First, formulate icmp message */ - m = m_gethdr(M_DONTWAIT, MT_HEADER); /* MAC-OK */ + if (MHLEN > (sizeof(struct ip) + ICMP_MINLEN + icmplen)) + m = m_gethdr(M_DONTWAIT, MT_HEADER); /* MAC-OK */ + else + m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR); + if (m == NULL) goto freeit; - if (n->m_flags & M_SKIP_FIREWALL) { - /* set M_SKIP_FIREWALL to skip firewall check, since we're called from firewall */ + if (n->m_flags & M_SKIP_FIREWALL) { + /* + * set M_SKIP_FIREWALL to skip firewall check, since + * we're called from firewall + */ m->m_flags |= M_SKIP_FIREWALL; } #if CONFIG_MACF_NET mac_mbuf_label_associate_netlayer(n, m); #endif - icmplen = min(oiplen + 8, oip->ip_len); - if (icmplen < sizeof(struct ip)) { - printf("icmp_error: bad length\n"); - m_free(m); - goto freeit; - } - m->m_len = icmplen + ICMP_MINLEN; + m->m_len = icmplen + ICMP_MINLEN; /* for ICMP header and data */ MH_ALIGN(m, m->m_len); icp = mtod(m, struct icmp *); - if ((u_int)type > ICMP_MAXTYPE) - panic("icmp_error"); + if ((u_int)type > ICMP_MAXTYPE) { + m_freem(m); + goto freeit; + } icmpstat.icps_outhist[type]++; icp->icmp_type = type; if (type == ICMP_REDIRECT) @@ -290,8 +345,10 @@ icmp_error( * Now, copy old ip header (without options) * in front of icmp message. 
*/ - if (m->m_data - sizeof(struct ip) < m->m_pktdat) - panic("icmp len"); + if (m->m_data - sizeof(struct ip) < m->m_pktdat) { + m_freem(m); + goto freeit; + } m->m_data -= sizeof(struct ip); m->m_len += sizeof(struct ip); m->m_pkthdr.len = m->m_len; @@ -302,6 +359,7 @@ icmp_error( nip->ip_vhl = IP_VHL_BORING; nip->ip_p = IPPROTO_ICMP; nip->ip_tos = 0; + nip->ip_off = 0; icmp_reflect(m); freeit: @@ -856,7 +914,7 @@ icmp_send(struct mbuf *m, struct mbuf *opts) ROUTE_RELEASE(&ro); } -n_time +u_int32_t iptime(void) { struct timeval atv; diff --git a/bsd/netinet/ip_input.c b/bsd/netinet/ip_input.c index 9dde8f80e..bc1bb0f2f 100644 --- a/bsd/netinet/ip_input.c +++ b/bsd/netinet/ip_input.c @@ -2041,7 +2041,7 @@ ip_dooptions(struct mbuf *m, int pass, struct sockaddr_in *next_hop) struct in_ifaddr *ia; int opt, optlen, cnt, off, code, type = ICMP_PARAMPROB, forward = 0; struct in_addr *sin, dst; - n_time ntime; + u_int32_t ntime; struct sockaddr_in ipaddr = { sizeof (ipaddr), AF_INET, 0, { 0 }, { 0, } }; @@ -2305,8 +2305,6 @@ nosourcerouting: } return (0); bad: - /* XXX icmp_error adds in hdr length */ - ip->ip_len -= IP_VHL_HL(ip->ip_vhl) << 2; icmp_error(m, type, code, 0, 0); OSAddAtomic(1, &ipstat.ips_badoptions); return (1); diff --git a/bsd/netinet/tcp_cubic.c b/bsd/netinet/tcp_cubic.c index 7e2d00b07..2eb86f1a9 100644 --- a/bsd/netinet/tcp_cubic.c +++ b/bsd/netinet/tcp_cubic.c @@ -161,8 +161,10 @@ tcp_cubic_update(struct tcpcb *tp, u_int32_t rtt) float K, var; u_int32_t elapsed_time, win; - VERIFY(tp->t_ccstate->cub_last_max > 0); win = min(tp->snd_cwnd, tp->snd_wnd); + if (tp->t_ccstate->cub_last_max == 0) + tp->t_ccstate->cub_last_max = tp->snd_ssthresh; + if (tp->t_ccstate->cub_epoch_start == 0) { /* * This is the beginning of a new epoch, initialize some of diff --git a/bsd/netinet/tcp_debug.h b/bsd/netinet/tcp_debug.h index 0cfb8d953..d7a7130a5 100644 --- a/bsd/netinet/tcp_debug.h +++ b/bsd/netinet/tcp_debug.h @@ -67,7 +67,7 @@ #ifdef PRIVATE struct tcp_debug { - n_time td_time; + u_int32_t td_time; short td_act; short td_ostate; caddr_t td_tcb; diff --git a/bsd/netinet6/in6_proto.c b/bsd/netinet6/in6_proto.c index ee974e44e..53e362c89 100644 --- a/bsd/netinet6/in6_proto.c +++ b/bsd/netinet6/in6_proto.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008-2013 Apple Inc. All rights reserved. + * Copyright (c) 2008-2015 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -476,7 +476,7 @@ u_int32_t rip6_sendspace = RIPV6SNDQ; u_int32_t rip6_recvspace = RIPV6RCVQ; /* ICMPV6 parameters */ -int icmp6_rediraccept = 1; /* accept and process redirects */ +int icmp6_rediraccept = 0; /* accept and process redirects */ int icmp6_redirtimeout = 10 * 60; /* 10 minutes */ int icmp6errppslim = 500; /* 500 packets per second */ int icmp6rappslim = 10; /* 10 packets per second */ diff --git a/bsd/netinet6/ip6_input.c b/bsd/netinet6/ip6_input.c index 266547020..4cc199ca8 100644 --- a/bsd/netinet6/ip6_input.c +++ b/bsd/netinet6/ip6_input.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2014 Apple Inc. All rights reserved. + * Copyright (c) 2003-2015 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -605,27 +605,15 @@ ip6_input(struct mbuf *m) } ip6stat.ip6s_nxthist[ip6->ip6_nxt]++; - -#if IPFW2 - /* - * Check with the firewall... 
- */ - if (ip6_fw_enable && ip6_fw_chk_ptr) { - u_short port = 0; - /* If ipfw says divert, we have to just drop packet */ - /* use port as a dummy argument */ - if ((*ip6_fw_chk_ptr)(&ip6, NULL, &port, &m)) { - m_freem(m); - m = NULL; - } - if (!m) - goto done; - } -#endif /* IPFW2 */ - /* * Check against address spoofing/corruption. */ + if (!(m->m_pkthdr.pkt_flags & PKTF_LOOP) && + IN6_IS_ADDR_LOOPBACK(&ip6->ip6_src)) { + ip6stat.ip6s_badscope++; + in6_ifstat_inc(inifp, ifs6_in_addrerr); + goto bad; + } if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src) || IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_dst)) { /* @@ -681,6 +669,22 @@ ip6_input(struct mbuf *m) goto bad; } #endif +#if IPFW2 + /* + * Check with the firewall... + */ + if (ip6_fw_enable && ip6_fw_chk_ptr) { + u_short port = 0; + /* If ipfw says divert, we have to just drop packet */ + /* use port as a dummy argument */ + if ((*ip6_fw_chk_ptr)(&ip6, NULL, &port, &m)) { + m_freem(m); + m = NULL; + } + if (!m) + goto done; + } +#endif /* IPFW2 */ /* * Naively assume we can attribute inbound data to the route we would diff --git a/bsd/nfs/nfs_bio.c b/bsd/nfs/nfs_bio.c index a58e5d866..c0715a2b0 100644 --- a/bsd/nfs/nfs_bio.c +++ b/bsd/nfs/nfs_bio.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2014 Apple Inc. All rights reserved. + * Copyright (c) 2000-2015 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -3797,7 +3797,7 @@ again: } } - if (req->r_achain.tqe_next == NFSREQNOLIST) + if (req->r_achain.tqe_next == NFSREQNOLIST || req->r_achain.tqe_next == NFSIODCOMPLETING) TAILQ_INSERT_TAIL(&nmp->nm_iodq, req, r_achain); /* If this mount doesn't already have an nfsiod working on it... */ diff --git a/bsd/nfs/nfs_vfsops.c b/bsd/nfs/nfs_vfsops.c index ad7d5a271..49d487f53 100644 --- a/bsd/nfs/nfs_vfsops.c +++ b/bsd/nfs/nfs_vfsops.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2014 Apple Inc. All rights reserved. + * Copyright (c) 2000-2015 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -4302,8 +4302,7 @@ nfs_mount_zombie(struct nfsmount *nmp, int nm_state_flags) nfs4_mount_callback_shutdown(nmp); /* Destroy any RPCSEC_GSS contexts */ - if (!TAILQ_EMPTY(&nmp->nm_gsscl)) - nfs_gss_clnt_ctx_unmount(nmp); + nfs_gss_clnt_ctx_unmount(nmp); /* mark the socket for termination */ lck_mtx_lock(&nmp->nm_lock); diff --git a/bsd/sys/Makefile b/bsd/sys/Makefile index 880af7e3d..30a5166b8 100644 --- a/bsd/sys/Makefile +++ b/bsd/sys/Makefile @@ -135,7 +135,7 @@ EXPORT_MI_LIST = ${KERNELFILES} ${PRIVATE_KERNELFILES} linker_set.h bsdtask_info vnode_internal.h proc_internal.h file_internal.h mount_internal.h \ uio_internal.h tree.h munge.h kern_tests.h -EXPORT_MI_GEN_LIST = syscall.h sysproto.h +EXPORT_MI_GEN_LIST = syscall.h sysproto.h kdebugevents.h EXPORT_MI_DIR = sys @@ -150,9 +150,10 @@ INSTALL_KF_MI_LCL_GEN_LIST = sysproto.h # /System/Library/Frameworks/Kernel.framework/Headers INSTALL_KF_MI_LIST = ${KERNELFILES} -INSTALL_KF_MI_GEN_LIST = +INSTALL_KF_MI_GEN_LIST = MAKESYSCALLS = $(SRCROOT)/bsd/kern/makesyscalls.sh +MAKEKDEBUGEVENTS = $(SRCROOT)/bsd/kern/makekdebugevents.py $(OBJROOT)/cscope.genhdrs: $(_v)mkdir -p $(OBJROOT)/cscope.genhdrs @@ -167,6 +168,11 @@ sysproto.h: $(SRCROOT)/bsd/kern/syscalls.master $(MAKESYSCALLS) $(OBJROOT)/cscop @echo "$(OBJPATH)/bsd/sys/$@" > $(OBJROOT)/cscope.genhdrs/$@.path $(_v)$(MAKESYSCALLS) $< proto > /dev/null +kdebugevents.h: $(SRCROOT)/bsd/kern/trace.codes $(MAKEKDEBUGEVENTS) $(OBJROOT)/cscope.genhdrs + @echo "Generating bsd/kern/$@ from $<"; + @echo "$(OBJPATH)/bsd/kern/$@" > $(OBJROOT)/cscope.genhdrs/$@.path + $(_v)$(MAKEKDEBUGEVENTS) $< > "$(OBJPATH)/bsd/sys/$@" + MAKE_POSIX_AVAILABILITY = $(SRCROOT)/bsd/sys/make_posix_availability.sh _posix_availability.h: $(MAKE_POSIX_AVAILABILITY) @echo "Generating bsd/sys/$@" diff --git a/bsd/sys/dtrace.h b/bsd/sys/dtrace.h index fa41389d4..3e39fca6a 100644 --- a/bsd/sys/dtrace.h +++ b/bsd/sys/dtrace.h @@ -26,6 +26,8 @@ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * + * Portions Copyright (c) 2012 by Delphix. All rights reserved. */ #ifndef _SYS_DTRACE_H @@ -103,6 +105,7 @@ extern "C" { #define S_ROUND(x, a) ((x) + (((a) ? (a) : 1) - 1) & ~(((a) ? (a) : 1) - 1)) #define P2ROUNDUP(x, align) (-(-(x) & -(align))) +#define P2PHASEUP(x, align, phase) ((phase) - (((phase) - (x)) & -(align))) #define CTF_MODEL_ILP32 1 /* object data model is ILP32 */ #define CTF_MODEL_LP64 2 /* object data model is LP64 */ @@ -1046,10 +1049,10 @@ typedef struct dtrace_ecbdesc { * DTrace Metadata Description Structures * * DTrace separates the trace data stream from the metadata stream. The only - * metadata tokens placed in the data stream are enabled probe identifiers - * (EPIDs) or (in the case of aggregations) aggregation identifiers. In order - * to determine the structure of the data, DTrace consumers pass the token to - * the kernel, and receive in return a corresponding description of the enabled + * metadata tokens placed in the data stream are the dtrace_rechdr_t (EPID + + * timestamp) or (in the case of aggregations) aggregation identifiers. To + * determine the structure of the data, DTrace consumers pass the token to the + * kernel, and receive in return a corresponding description of the enabled * probe (via the dtrace_eprobedesc structure) or the aggregation (via the * dtrace_aggdesc structure). 
Both of these structures are expressed in terms * of record descriptions (via the dtrace_recdesc structure) that describe the @@ -1147,11 +1150,12 @@ typedef struct dtrace_fmtdesc { #define DTRACEOPT_AGGHIST 27 /* histogram aggregation output */ #define DTRACEOPT_AGGPACK 28 /* packed aggregation output */ #define DTRACEOPT_AGGZOOM 29 /* zoomed aggregation scaling */ +#define DTRACEOPT_TEMPORAL 30 /* temporally ordered output */ #if !defined(__APPLE__) -#define DTRACEOPT_MAX 30 /* number of options */ -#else -#define DTRACEOPT_STACKSYMBOLS 30 /* clear to prevent stack symbolication */ #define DTRACEOPT_MAX 31 /* number of options */ +#else +#define DTRACEOPT_STACKSYMBOLS 31 /* clear to prevent stack symbolication */ +#define DTRACEOPT_MAX 32 /* number of options */ #endif /* __APPLE__ */ #define DTRACEOPT_UNSET (dtrace_optval_t)-2 /* unset option */ @@ -1172,7 +1176,9 @@ typedef struct dtrace_fmtdesc { * where user-level wishes the kernel to snapshot the buffer to (the * dtbd_data field). The kernel uses the same structure to pass back some * information regarding the buffer: the size of data actually copied out, the - * number of drops, the number of errors, and the offset of the oldest record. + * number of drops, the number of errors, the offset of the oldest record, + * and the time of the snapshot. + * * If the buffer policy is a "switch" policy, taking a snapshot of the * principal buffer has the additional effect of switching the active and * inactive buffers. Taking a snapshot of the aggregation buffer _always_ has @@ -1185,8 +1191,29 @@ typedef struct dtrace_bufdesc { uint64_t dtbd_drops; /* number of drops */ DTRACE_PTR(char, dtbd_data); /* data */ uint64_t dtbd_oldest; /* offset of oldest record */ + uint64_t dtbd_timestamp; /* hrtime of snapshot */ } dtrace_bufdesc_t; +/* + * Each record in the buffer (dtbd_data) begins with a header that includes + * the epid and a timestamp. The timestamp is split into two 4-byte parts + * so that we do not require 8-byte alignment. + */ +typedef struct dtrace_rechdr { + dtrace_epid_t dtrh_epid; /* enabled probe id */ + uint32_t dtrh_timestamp_hi; /* high bits of hrtime_t */ + uint32_t dtrh_timestamp_lo; /* low bits of hrtime_t */ +} dtrace_rechdr_t; + +#define DTRACE_RECORD_LOAD_TIMESTAMP(dtrh) \ + ((dtrh)->dtrh_timestamp_lo + \ + ((uint64_t)(dtrh)->dtrh_timestamp_hi << 32)) + +#define DTRACE_RECORD_STORE_TIMESTAMP(dtrh, hrtime) { \ + (dtrh)->dtrh_timestamp_lo = (uint32_t)hrtime; \ + (dtrh)->dtrh_timestamp_hi = hrtime >> 32; \ +} + /* * DTrace Status * diff --git a/bsd/sys/dtrace_impl.h b/bsd/sys/dtrace_impl.h index 71dc020f2..cbb14c0ab 100644 --- a/bsd/sys/dtrace_impl.h +++ b/bsd/sys/dtrace_impl.h @@ -22,6 +22,8 @@ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * + * Portions Copyright (c) 2012 by Delphix. All rights reserved. */ #ifndef _SYS_DTRACE_IMPL_H @@ -202,15 +204,18 @@ typedef struct dtrace_hash { * predicate is non-NULL, the DIF object is executed. If the result is * non-zero, the action list is processed, with each action being executed * accordingly. When the action list has been completely executed, processing - * advances to the next ECB. processing advances to the next ECB. If the - * result is non-zero; For each ECB, it first determines the The ECB - * abstraction allows disjoint consumers to multiplex on single probes. + * advances to the next ECB. The ECB abstraction allows disjoint consumers + * to multiplex on single probes. 
+ * + * Execution of the ECB results in consuming dte_size bytes in the buffer + * to record data. During execution, dte_needed bytes must be available in + * the buffer. This space is used for both recorded data and tuple data. */ struct dtrace_ecb { dtrace_epid_t dte_epid; /* enabled probe ID */ uint32_t dte_alignment; /* required alignment */ - size_t dte_needed; /* bytes needed */ - size_t dte_size; /* total size of payload */ + size_t dte_needed; /* space needed for execution */ + size_t dte_size; /* size of recorded payload */ dtrace_predicate_t *dte_predicate; /* predicate, if any */ dtrace_action_t *dte_action; /* actions, if any */ dtrace_ecb_t *dte_next; /* next ECB on probe */ @@ -268,27 +273,30 @@ typedef struct dtrace_aggregation { * the EPID, the consumer can determine the data layout. (The data buffer * layout is shown schematically below.) By assuring that one can determine * data layout from the EPID, the metadata stream can be separated from the - * data stream -- simplifying the data stream enormously. - * - * base of data buffer ---> +------+--------------------+------+ - * | EPID | data | EPID | - * +------+--------+------+----+------+ - * | data | EPID | data | - * +---------------+------+-----------+ - * | data, cont. | - * +------+--------------------+------+ - * | EPID | data | | - * +------+--------------------+ | - * | || | - * | || | - * | \/ | - * : : - * . . - * . . - * . . - * : : - * | | - * limit of data buffer ---> +----------------------------------+ + * data stream -- simplifying the data stream enormously. The ECB always + * proceeds the recorded data as part of the dtrace_rechdr_t structure that + * includes the EPID and a high-resolution timestamp used for output ordering + * consistency. + * + * base of data buffer ---> +--------+--------------------+--------+ + * | rechdr | data | rechdr | + * +--------+------+--------+----+--------+ + * | data | rechdr | data | + * +---------------+--------+-------------+ + * | data, cont. | + * +--------+--------------------+--------+ + * | rechdr | data | | + * +--------+--------------------+ | + * | || | + * | || | + * | \/ | + * : : + * . . + * . . + * . . + * : : + * | | + * limit of data buffer ---> +--------------------------------------+ * * When evaluating an ECB, dtrace_probe() determines if the ECB's needs of the * principal buffer (both scratch and payload) exceed the available space. If @@ -426,6 +434,8 @@ typedef struct dtrace_buffer { #ifndef _LP64 uint64_t dtb_pad1; #endif + uint64_t dtb_switched; /* time of last switch */ + uint64_t dtb_interval; /* observed switch interval */ } dtrace_buffer_t; /* diff --git a/bsd/sys/event.h b/bsd/sys/event.h index 66efc61b0..44cef5438 100644 --- a/bsd/sys/event.h +++ b/bsd/sys/event.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2012 Apple Inc. All rights reserved. + * Copyright (c) 2003-2015 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -177,14 +177,20 @@ struct kevent64_s { * (which always returns true for regular files - regardless of the amount * of unread data in the file). * - * On input, EV_OOBAND specifies that only OOB data should be looked for. - * The returned data count is the number of bytes beyond the current OOB marker. + * On input, EV_OOBAND specifies that filter should actively return in the + * presence of OOB on the descriptor. 
It implies that filter will return + * if there is OOB data available to read OR when any other condition + * for the read are met (for example number of bytes regular data becomes >= + * low-watermark). + * If EV_OOBAND is not set on input, it implies that the filter should not actively + * return for out of band data on the descriptor. The filter will then only return + * when some other condition for read is met (ex: when number of regular data bytes + * >=low-watermark OR when socket can't receive more data (SS_CANTRCVMORE)). * - * On output, EV_OOBAND indicates that OOB data is present + * On output, EV_OOBAND indicates the presence of OOB data on the descriptor. * If it was not specified as an input parameter, then the data count is the - * number of bytes before the current OOB marker. If at the marker, the - * data count indicates the number of bytes available after it. In either - * case, it's the amount of data one could expect to receive next. + * number of bytes before the current OOB marker, else data count is the number + * of bytes beyond OOB marker. */ #define EV_POLL EV_FLAG0 #define EV_OOBAND EV_FLAG1 @@ -474,7 +480,7 @@ extern int knote_link_wait_queue(struct knote *kn, struct wait_queue *wq, wait_q extern int knote_unlink_wait_queue(struct knote *kn, struct wait_queue *wq, wait_queue_link_t *wqlp); extern void knote_fdclose(struct proc *p, int fd); extern void knote_markstayqueued(struct knote *kn); - +extern void knote_clearstayqueued(struct knote *kn); #endif /* !KERNEL_PRIVATE */ #else /* KERNEL */ diff --git a/bsd/sys/kdebug.h b/bsd/sys/kdebug.h index 186ace8f3..af75e23a1 100644 --- a/bsd/sys/kdebug.h +++ b/bsd/sys/kdebug.h @@ -291,15 +291,15 @@ extern int kdebug_trace(uint32_t code, uint64_t arg1, uint64_t arg2, uint64_t ar /* Codes for Selective Forced Idle (DBG_MACH_SFI) */ #define SFI_SET_WINDOW 0x0 #define SFI_CANCEL_WINDOW 0x1 -#define SFI_SET_CLASS_OFFTIME 0x2 +#define SFI_SET_CLASS_OFFTIME 0x2 #define SFI_CANCEL_CLASS_OFFTIME 0x3 #define SFI_THREAD_DEFER 0x4 #define SFI_OFF_TIMER 0x5 #define SFI_ON_TIMER 0x6 #define SFI_WAIT_CANCELED 0x7 #define SFI_PID_SET_MANAGED 0x8 -#define SFI_PID_CLEAR_MANAGED 0x9 - +#define SFI_PID_CLEAR_MANAGED 0x9 +#define SFI_GLOBAL_DEFER 0xa /* **** The Kernel Debug Sub Classes for Network (DBG_NETWORK) **** */ #define DBG_NETIP 1 /* Internet Protocol */ #define DBG_NETARP 2 /* Address Resolution Protocol */ @@ -462,11 +462,17 @@ extern int kdebug_trace(uint32_t code, uint64_t arg1, uint64_t arg2, uint64_t ar #define DBG_TRACE_STRING 1 #define DBG_TRACE_INFO 2 -/* - * TRACE_DATA_NEWTHREAD 0x1 - * TRACE_DATA_EXEC 0x2 - */ -#define TRACE_DATA_THREAD_TERMINATE 0x3 /* thread has been queued for deallocation and can no longer run */ +/* The Kernel Debug events: */ +#define TRACE_DATA_NEWTHREAD (TRACEDBG_CODE(DBG_TRACE_DATA, 1)) +#define TRACE_DATA_EXEC (TRACEDBG_CODE(DBG_TRACE_DATA, 2)) +#define TRACE_DATA_THREAD_TERMINATE (TRACEDBG_CODE(DBG_TRACE_DATA, 3)) +#define TRACE_STRING_NEWTHREAD (TRACEDBG_CODE(DBG_TRACE_STRING, 1)) +#define TRACE_STRING_EXEC (TRACEDBG_CODE(DBG_TRACE_STRING, 2)) +#define TRACE_PANIC (TRACEDBG_CODE(DBG_TRACE_INFO, 0)) +#define TRACE_TIMESTAMPS (TRACEDBG_CODE(DBG_TRACE_INFO, 1)) +#define TRACE_LOST_EVENTS (TRACEDBG_CODE(DBG_TRACE_INFO, 2)) +#define TRACE_WRITING_EVENTS (TRACEDBG_CODE(DBG_TRACE_INFO, 3)) +#define TRACE_INFO_STRING (TRACEDBG_CODE(DBG_TRACE_INFO, 4)) /* The Kernel Debug Sub Classes for DBG_CORESTORAGE */ #define DBG_CS_IO 0 @@ -638,6 +644,7 @@ extern unsigned int kdebug_enable; #define 
KDEBUG_ENABLE_ENTROPY 0x2 /* Obsolescent */ #define KDEBUG_ENABLE_CHUD 0x4 #define KDEBUG_ENABLE_PPT 0x8 +#define KDEBUG_ENABLE_SERIAL 0x10 /* * Infer the supported kernel debug event level from config option. @@ -1053,6 +1060,14 @@ typedef struct { /* Minimum value allowed when setting decrementer ticks */ #define KDBG_MINRTCDEC 2500 +/* VFS lookup events for serial traces */ +#define VFS_LOOKUP (FSDBG_CODE(DBG_FSRW,36)) +#define VFS_LOOKUP_DONE (FSDBG_CODE(DBG_FSRW,39)) + +#if (DEVELOPMENT || DEBUG) +#define KDEBUG_MOJO_TRACE 1 +#endif + #endif /* __APPLE_API_PRIVATE */ #endif /* PRIVATE */ diff --git a/bsd/vfs/vfs_fsevents.c b/bsd/vfs/vfs_fsevents.c index fd3636132..6a90031c6 100644 --- a/bsd/vfs/vfs_fsevents.c +++ b/bsd/vfs/vfs_fsevents.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2008 Apple Inc. All rights reserved. + * Copyright (c) 2004-2014 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -643,6 +643,7 @@ add_fsevent(int type, vfs_context_t ctx, ...) VATTR_WANTED(&va, va_mode); VATTR_WANTED(&va, va_uid); VATTR_WANTED(&va, va_gid); + VATTR_WANTED(&va, va_nlink); if ((ret = vnode_getattr(vp, &va, vfs_context_kernel())) != 0) { // printf("add_fsevent: failed to getattr on vp %p (%d)\n", cur->fref.vp, ret); cur->str = NULL; @@ -655,6 +656,12 @@ add_fsevent(int type, vfs_context_t ctx, ...) cur->mode = (int32_t)vnode_vttoif(vnode_vtype(vp)) | va.va_mode; cur->uid = va.va_uid; cur->gid = va.va_gid; + if (vp->v_flag & VISHARDLINK) { + cur->mode |= FSE_MODE_HLINK; + if ((vp->v_type == VDIR && va.va_dirlinkcount == 0) || (vp->v_type == VREG && va.va_nlink == 0)) { + cur->mode |= FSE_MODE_LAST_HLINK; + } + } // if we haven't gotten the path yet, get it. if (pathbuff == NULL) { diff --git a/bsd/vfs/vfs_lookup.c b/bsd/vfs/vfs_lookup.c index 579542e16..4beff12a6 100644 --- a/bsd/vfs/vfs_lookup.c +++ b/bsd/vfs/vfs_lookup.c @@ -1679,9 +1679,9 @@ kdebug_lookup_gen_events(long *dbg_parms, int dbg_namelen, void *dp, boolean_t l * entries, we must mark the start of the path's string and the end. */ if (lookup == TRUE) - code = (FSDBG_CODE(DBG_FSRW,36)) | DBG_FUNC_START; + code = VFS_LOOKUP | DBG_FUNC_START; else - code = (FSDBG_CODE(DBG_FSRW,39)) | DBG_FUNC_START; + code = VFS_LOOKUP_DONE | DBG_FUNC_START; if (dbg_namelen <= (int)(3 * sizeof(long))) code |= DBG_FUNC_END; diff --git a/bsd/vfs/vfs_syscalls.c b/bsd/vfs/vfs_syscalls.c index 521cb5713..ba37a4e38 100644 --- a/bsd/vfs/vfs_syscalls.c +++ b/bsd/vfs/vfs_syscalls.c @@ -9235,6 +9235,24 @@ fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long is64bit = proc_is64bit(p); memp = NULL; + + /* + * ensure the buffer is large enough for underlying calls + */ +#ifndef HFSIOC_GETPATH +typedef char pn_t[MAXPATHLEN]; +#define HFSIOC_GETPATH _IOWR('h', 13, pn_t) +#endif + +#ifndef HFS_GETPATH +#define HFS_GETPATH IOCBASECMD(HFSIOC_GETPATH) +#endif + if (IOCBASECMD(cmd) == HFS_GETPATH) { + /* Round up to MAXPATHLEN regardless of user input */ + size = MAXPATHLEN; + } + + if (size > sizeof (stkbuf)) { if ((memp = (caddr_t)kalloc(size)) == 0) return ENOMEM; data = memp; diff --git a/config/MasterVersion b/config/MasterVersion index 1e84dc0e9..747e47ed9 100644 --- a/config/MasterVersion +++ b/config/MasterVersion @@ -1,4 +1,4 @@ -14.1.0 +14.3.0 # The first line of this file contains the master version number for the kernel. # All other instances of the kernel version in xnu are derived from this file. 
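For context on the kdebugevents.h generation added above (the makekdebugevents.py script together with the new rule in bsd/sys/Makefile): the script emits a kd_event_t table built from trace.codes, sorted by event id. A minimal sketch of what the generated header and a lookup over it could look like follows; the entries are abridged from trace.codes, and the kdbg_event_name() helper is purely illustrative and not part of this change.

#include <stddef.h>
#include <stdint.h>

/* Shape of the generated bsd/sys/kdebugevents.h (entries abridged). */
typedef struct {
	uint32_t id;
	const char *name;
} kd_event_t;

kd_event_t kd_events[] = {
	{0x1a20028, "SFI_GLOBAL_DEFER"},
	{0x5310284, "CPUPM_FI"},
	{0x5330000, "HIBERNATE"},
};

/*
 * Illustrative helper (not in this change): binary search is valid because
 * the script emits kd_events[] sorted by ascending id.
 */
static const char *
kdbg_event_name(uint32_t id)
{
	size_t lo = 0, hi = sizeof (kd_events) / sizeof (kd_events[0]);

	while (lo < hi) {
		size_t mid = lo + (hi - lo) / 2;

		if (kd_events[mid].id == id)
			return kd_events[mid].name;
		if (kd_events[mid].id < id)
			lo = mid + 1;
		else
			hi = mid;
	}
	return NULL;	/* unknown event id */
}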
diff --git a/config/Private.exports b/config/Private.exports index 173309543..d7995f81e 100644 --- a/config/Private.exports +++ b/config/Private.exports @@ -24,10 +24,20 @@ _buf_create_shadow _buf_kernel_addrperm_addr _buf_setfilter _buf_shadow +_bufattr_alloc +_bufattr_dup +_bufattr_free +_bufattr_greedymode +_bufattr_isochronous +_bufattr_markgreedymode +_bufattr_markisochronous +_bufattr_markmeta +_bufattr_markquickcomplete _bufattr_meta _bufattr_nocache -_bufattr_throttled _bufattr_passive +_bufattr_quickcomplete +_bufattr_throttled _cdevsw _cdevsw_setkqueueok _chudxnu_platform_ptr diff --git a/libsyscall/mach/.gitignore b/libsyscall/mach/.gitignore new file mode 100644 index 000000000..f718d68d2 --- /dev/null +++ b/libsyscall/mach/.gitignore @@ -0,0 +1,3 @@ +*.pbxuser +*.perspectivev3 +build/ diff --git a/osfmk/atm/atm.c b/osfmk/atm/atm.c index f1ba71062..92e9547bf 100644 --- a/osfmk/atm/atm.c +++ b/osfmk/atm/atm.c @@ -337,6 +337,9 @@ atm_get_value( if (kr != KERN_SUCCESS) { break; } + } else { + kr = KERN_INVALID_TASK; + break; } /* Increment sync value. */ @@ -939,8 +942,8 @@ atm_listener_insert( */ next->mailbox = mailbox; lck_mtx_unlock(&atm_value->listener_lock); - KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (ATM_CODE(ATM_GETVALUE_INFO, (ATM_VALUE_REPLACED))) | DBG_FUNC_NONE, - atm_value, atm_value->aid, mailbox_offset, 0, 0); + KERNEL_DEBUG_CONSTANT((ATM_CODE(ATM_GETVALUE_INFO, (ATM_VALUE_REPLACED))) | DBG_FUNC_NONE, + VM_KERNEL_ADDRPERM(atm_value), atm_value->aid, mailbox_offset, 0, 0); /* Drop the extra reference on task descriptor taken by this function. */ atm_task_descriptor_dealloc(task_descriptor); @@ -948,8 +951,8 @@ atm_listener_insert( return KERN_SUCCESS; } } - KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (ATM_CODE(ATM_GETVALUE_INFO, (ATM_VALUE_ADDED))) | DBG_FUNC_NONE, - atm_value, atm_value->aid, mailbox_offset, 0, 0); + KERNEL_DEBUG_CONSTANT((ATM_CODE(ATM_GETVALUE_INFO, (ATM_VALUE_ADDED))) | DBG_FUNC_NONE, + VM_KERNEL_ADDRPERM(atm_value), atm_value->aid, mailbox_offset, 0, 0); queue_enter(&atm_value->listeners, new_link_object, atm_link_object_t, listeners_element); atm_value->listener_count++; @@ -1006,18 +1009,18 @@ atm_listener_delete( if (elem->descriptor == task_descriptor) { if (elem->mailbox == mailbox) { - KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (ATM_CODE(ATM_UNREGISTER_INFO, + KERNEL_DEBUG_CONSTANT((ATM_CODE(ATM_UNREGISTER_INFO, (ATM_VALUE_UNREGISTERED))) | DBG_FUNC_NONE, - atm_value, atm_value->aid, mailbox_offset, 0, 0); + VM_KERNEL_ADDRPERM(atm_value), atm_value->aid, mailbox_offset, 0, 0); queue_remove(&atm_value->listeners, elem, atm_link_object_t, listeners_element); queue_enter(&free_listeners, elem, atm_link_object_t, listeners_element); atm_value->listener_count--; kr = KERN_SUCCESS; break; } else { - KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (ATM_CODE(ATM_UNREGISTER_INFO, + KERNEL_DEBUG_CONSTANT((ATM_CODE(ATM_UNREGISTER_INFO, (ATM_VALUE_DIFF_MAILBOX))) | DBG_FUNC_NONE, - atm_value, atm_value->aid, 0, 0, 0); + VM_KERNEL_ADDRPERM(atm_value), atm_value->aid, 0, 0, 0); kr = KERN_INVALID_VALUE; break; } @@ -1255,7 +1258,7 @@ atm_get_min_sub_aid_array( atm_value_t atm_value; uint32_t i; - KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (ATM_CODE(ATM_SUBAID_INFO, (ATM_MIN_CALLED))) | DBG_FUNC_START, + KERNEL_DEBUG_CONSTANT((ATM_CODE(ATM_SUBAID_INFO, (ATM_MIN_CALLED))) | DBG_FUNC_START, 0, 0, 0, 0, 0); for (i = 0; i < count; i++) { @@ -1268,7 +1271,7 @@ atm_get_min_sub_aid_array( atm_value_dealloc(atm_value); } - KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, 
(ATM_CODE(ATM_SUBAID_INFO, (ATM_MIN_CALLED))) | DBG_FUNC_END, + KERNEL_DEBUG_CONSTANT((ATM_CODE(ATM_SUBAID_INFO, (ATM_MIN_CALLED))) | DBG_FUNC_END, count, 0, 0, 0, 0); } @@ -1292,7 +1295,7 @@ atm_get_min_sub_aid(atm_value_t atm_value) atm_link_object_t next, elem; queue_head_t free_listeners; - KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (ATM_CODE(ATM_SUBAID_INFO, (ATM_MIN_LINK_LIST))) | DBG_FUNC_START, + KERNEL_DEBUG_CONSTANT((ATM_CODE(ATM_SUBAID_INFO, (ATM_MIN_LINK_LIST))) | DBG_FUNC_START, 0, 0, 0, 0, 0); lck_mtx_lock(&atm_value->listener_lock); @@ -1385,7 +1388,7 @@ atm_get_min_sub_aid(atm_value_t atm_value) atm_link_dealloc(next); } - KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (ATM_CODE(ATM_SUBAID_INFO, (ATM_MIN_LINK_LIST))) | DBG_FUNC_END, + KERNEL_DEBUG_CONSTANT((ATM_CODE(ATM_SUBAID_INFO, (ATM_MIN_LINK_LIST))) | DBG_FUNC_END, j, freed_count, dead_but_not_freed, 0, 0); /* explicitly upgrade uint32_t to 64 bit mach size */ diff --git a/osfmk/device/device_init.c b/osfmk/device/device_init.c index b935e14c6..8b00349eb 100644 --- a/osfmk/device/device_init.c +++ b/osfmk/device/device_init.c @@ -80,6 +80,7 @@ #include ipc_port_t master_device_port; +void *master_device_kobject; lck_grp_attr_t * dev_lck_grp_attr; lck_grp_t * dev_lck_grp; @@ -93,8 +94,8 @@ device_service_create(void) if (master_device_port == IP_NULL) panic("can't allocate master device port"); - ipc_kobject_set(master_device_port, 1, IKOT_MASTER_DEVICE); - kernel_set_special_port(host_priv_self(), HOST_IO_MASTER_PORT, + ipc_kobject_set(master_device_port, (ipc_kobject_t)&master_device_kobject, IKOT_MASTER_DEVICE); + kernel_set_special_port(host_priv_self(), HOST_IO_MASTER_PORT, ipc_port_make_send(master_device_port)); /* allocate device lock group attribute and group */ diff --git a/osfmk/i386/AT386/model_dep.c b/osfmk/i386/AT386/model_dep.c index fb9157b88..35d4bf279 100644 --- a/osfmk/i386/AT386/model_dep.c +++ b/osfmk/i386/AT386/model_dep.c @@ -74,6 +74,7 @@ #include #include #include +#include #include #include #include @@ -103,6 +104,7 @@ #include /* inb() */ #include +#include #include #include #include @@ -127,11 +129,23 @@ #endif static void machine_conf(void); +void panic_print_symbol_name(vm_address_t search); +extern boolean_t init_task_died; +extern const char version[]; +extern char osversion[]; extern int max_unsafe_quanta; extern int max_poll_quanta; extern unsigned int panic_is_inited; +extern int proc_pid(void *p); + +/* Definitions for frame pointers */ +#define FP_ALIGNMENT_MASK ((uint32_t)(0x3)) +#define FP_LR_OFFSET ((uint32_t)4) +#define FP_LR_OFFSET64 ((uint32_t)8) +#define FP_MAX_NUM_TO_EVALUATE (50) + int db_run_mode; volatile int pbtcpu = -1; @@ -155,6 +169,93 @@ static unsigned commit_paniclog_to_nvram; unsigned int debug_boot_arg; +/* + * Backtrace a single frame. 
+ */ +void +print_one_backtrace(pmap_t pmap, vm_offset_t topfp, const char *cur_marker, + boolean_t is_64_bit, boolean_t nvram_format) +{ + int i = 0; + addr64_t lr; + addr64_t fp; + addr64_t fp_for_ppn; + ppnum_t ppn; + boolean_t dump_kernel_stack; + + fp = topfp; + fp_for_ppn = 0; + ppn = (ppnum_t)NULL; + + if (fp >= VM_MIN_KERNEL_ADDRESS) + dump_kernel_stack = TRUE; + else + dump_kernel_stack = FALSE; + + do { + if ((fp == 0) || ((fp & FP_ALIGNMENT_MASK) != 0)) + break; + if (dump_kernel_stack && ((fp < VM_MIN_KERNEL_ADDRESS) || (fp > VM_MAX_KERNEL_ADDRESS))) + break; + if ((!dump_kernel_stack) && (fp >=VM_MIN_KERNEL_ADDRESS)) + break; + + /* Check to see if current address will result in a different + ppn than previously computed (to avoid recomputation) via + (addr) ^ fp_for_ppn) >> PAGE_SHIFT) */ + + if ((((fp + FP_LR_OFFSET) ^ fp_for_ppn) >> PAGE_SHIFT) != 0x0U) { + ppn = pmap_find_phys(pmap, fp + FP_LR_OFFSET); + fp_for_ppn = fp + (is_64_bit ? FP_LR_OFFSET64 : FP_LR_OFFSET); + } + if (ppn != (ppnum_t)NULL) { + if (is_64_bit) { + lr = ml_phys_read_double_64(((((vm_offset_t)ppn) << PAGE_SHIFT)) | ((fp + FP_LR_OFFSET64) & PAGE_MASK)); + } else { + lr = ml_phys_read_word(((((vm_offset_t)ppn) << PAGE_SHIFT)) | ((fp + FP_LR_OFFSET) & PAGE_MASK)); + } + } else { + if (is_64_bit) { + kdb_printf("%s\t Could not read LR from frame at 0x%016llx\n", cur_marker, fp + FP_LR_OFFSET64); + } else { + kdb_printf("%s\t Could not read LR from frame at 0x%08x\n", cur_marker, (uint32_t)(fp + FP_LR_OFFSET)); + } + break; + } + if (((fp ^ fp_for_ppn) >> PAGE_SHIFT) != 0x0U) { + ppn = pmap_find_phys(pmap, fp); + fp_for_ppn = fp; + } + if (ppn != (ppnum_t)NULL) { + if (is_64_bit) { + fp = ml_phys_read_double_64(((((vm_offset_t)ppn) << PAGE_SHIFT)) | (fp & PAGE_MASK)); + } else { + fp = ml_phys_read_word(((((vm_offset_t)ppn) << PAGE_SHIFT)) | (fp & PAGE_MASK)); + } + } else { + if (is_64_bit) { + kdb_printf("%s\t Could not read FP from frame at 0x%016llx\n", cur_marker, fp); + } else { + kdb_printf("%s\t Could not read FP from frame at 0x%08x\n", cur_marker, (uint32_t)fp); + } + break; + } + + if (nvram_format) { + if (is_64_bit) { + kdb_printf("%s\t0x%016llx\n", cur_marker, lr); + } else { + kdb_printf("%s\t0x%08x\n", cur_marker, (uint32_t)lr); + } + } else { + if (is_64_bit) { + kdb_printf("%s\t lr: 0x%016llx fp: 0x%016llx\n", cur_marker, lr, fp); + } else { + kdb_printf("%s\t lr: 0x%08x fp: 0x%08x\n", cur_marker, (uint32_t)lr, (uint32_t)fp); + } + } + } while ((++i < FP_MAX_NUM_TO_EVALUATE) && (fp != topfp)); +} void machine_startup(void) { @@ -171,6 +272,12 @@ machine_startup(void) if (debug_boot_arg & DB_PRT) disable_debug_output=FALSE; if (debug_boot_arg & DB_SLOG) systemLogDiags=TRUE; if (debug_boot_arg & DB_LOG_PI_SCRN) logPanicDataToScreen=TRUE; +#if KDEBUG_MOJO_TRACE + if (debug_boot_arg & DB_PRT_KDEBUG) { + kdebug_serial = TRUE; + disable_debug_output = FALSE; + } +#endif } else { debug_boot_arg = 0; } @@ -757,6 +864,16 @@ machine_halt_cpu(void) { pmCPUHalt(PM_HALT_DEBUG); } +static int pid_from_task(task_t task) +{ + int pid = -1; + + if (task->bsd_info) + pid = proc_pid(task->bsd_info); + + return pid; +} + void DebuggerWithContext( __unused unsigned int reason, @@ -773,6 +890,9 @@ Debugger( unsigned long pi_size = 0; void *stackptr; int cn = cpu_number(); + task_t task = current_task(); + int task_pid = pid_from_task(task); + hw_atomic_add(&debug_mode, 1); if (!panic_is_inited) { @@ -802,7 +922,12 @@ Debugger( __asm__ volatile("movq %%rbp, %0" : "=m" (stackptr)); /* Print backtrace - callee is 
internally synchronized */ - panic_i386_backtrace(stackptr, ((panic_double_fault_cpu == cn) ? 80: 48), NULL, FALSE, NULL); + if ((task_pid == 1) && (init_task_died)) { + /* Special handling of launchd died panics */ + print_launchd_info(); + } else { + panic_i386_backtrace(stackptr, ((panic_double_fault_cpu == cn) ? 80: 48), NULL, FALSE, NULL); + } /* everything should be printed now so copy to NVRAM */ @@ -994,7 +1119,7 @@ panic_print_kmod_symbol_name(vm_address_t search) } } -static void +void panic_print_symbol_name(vm_address_t search) { /* try searching in the kernel */ @@ -1138,3 +1263,184 @@ out: bt_tsc_timeout = rdtsc64() + PBT_TIMEOUT_CYCLES; while(*ppbtcnt && (rdtsc64() < bt_tsc_timeout)); } + +static boolean_t +debug_copyin(pmap_t p, uint64_t uaddr, void *dest, size_t size) +{ + size_t rem = size; + char *kvaddr = dest; + + while (rem) { + ppnum_t upn = pmap_find_phys(p, uaddr); + uint64_t phys_src = ptoa_64(upn) | (uaddr & PAGE_MASK); + uint64_t phys_dest = kvtophys((vm_offset_t)kvaddr); + uint64_t src_rem = PAGE_SIZE - (phys_src & PAGE_MASK); + uint64_t dst_rem = PAGE_SIZE - (phys_dest & PAGE_MASK); + size_t cur_size = (uint32_t) MIN(src_rem, dst_rem); + cur_size = MIN(cur_size, rem); + + if (upn && pmap_valid_page(upn) && phys_dest) { + bcopy_phys(phys_src, phys_dest, cur_size); + } + else + break; + uaddr += cur_size; + kvaddr += cur_size; + rem -= cur_size; + } + return (rem == 0); +} + +void +print_threads_registers(thread_t thread) +{ + x86_saved_state_t *savestate; + + savestate = get_user_regs(thread); + kdb_printf( + "\nRAX: 0x%016llx, RBX: 0x%016llx, RCX: 0x%016llx, RDX: 0x%016llx\n" + "RSP: 0x%016llx, RBP: 0x%016llx, RSI: 0x%016llx, RDI: 0x%016llx\n" + "R8: 0x%016llx, R9: 0x%016llx, R10: 0x%016llx, R11: 0x%016llx\n" + "R12: 0x%016llx, R13: 0x%016llx, R14: 0x%016llx, R15: 0x%016llx\n" + "RFL: 0x%016llx, RIP: 0x%016llx, CS: 0x%016llx, SS: 0x%016llx\n\n", + savestate->ss_64.rax, savestate->ss_64.rbx, savestate->ss_64.rcx, savestate->ss_64.rdx, + savestate->ss_64.isf.rsp, savestate->ss_64.rbp, savestate->ss_64.rsi, savestate->ss_64.rdi, + savestate->ss_64.r8, savestate->ss_64.r9, savestate->ss_64.r10, savestate->ss_64.r11, + savestate->ss_64.r12, savestate->ss_64.r13, savestate->ss_64.r14, savestate->ss_64.r15, + savestate->ss_64.isf.rflags, savestate->ss_64.isf.rip, savestate->ss_64.isf.cs, + savestate->ss_64.isf.ss); +} + +void +print_tasks_user_threads(task_t task) +{ + thread_t thread = current_thread(); + x86_saved_state_t *savestate; + pmap_t pmap = 0; + uint64_t rbp; + const char *cur_marker = 0; + int j; + + for (j = 0, thread = (thread_t) queue_first(&task->threads); j < task->thread_count; + ++j, thread = (thread_t) queue_next(&thread->task_threads)) { + + kdb_printf("Thread %p\n", thread); + pmap = get_task_pmap(task); + savestate = get_user_regs(thread); + rbp = savestate->ss_64.rbp; + print_one_backtrace(pmap, (vm_offset_t)rbp, cur_marker, TRUE, TRUE); + kdb_printf("\n"); + } +} + +#define PANICLOG_UUID_BUF_SIZE 256 + +void print_uuid_info(task_t task) +{ + uint32_t uuid_info_count = 0; + mach_vm_address_t uuid_info_addr = 0; + boolean_t have_map = (task->map != NULL) && (ml_validate_nofault((vm_offset_t)(task->map), sizeof(struct _vm_map))); + boolean_t have_pmap = have_map && (task->map->pmap != NULL) && (ml_validate_nofault((vm_offset_t)(task->map->pmap), sizeof(struct pmap))); + int task_pid = pid_from_task(task); + char uuidbuf[PANICLOG_UUID_BUF_SIZE] = {0}; + char *uuidbufptr = uuidbuf; + uint32_t k; + + if (have_pmap && task->active && task_pid > 0) { 
+ /* Read dyld_all_image_infos struct from task memory to get UUID array count & location */ + struct user64_dyld_all_image_infos task_image_infos; + if (debug_copyin(task->map->pmap, task->all_image_info_addr, + &task_image_infos, sizeof(struct user64_dyld_all_image_infos))) { + uuid_info_count = (uint32_t)task_image_infos.uuidArrayCount; + uuid_info_addr = task_image_infos.uuidArray; + } + + /* If we get a NULL uuid_info_addr (which can happen when we catch dyld + * in the middle of updating this data structure), we zero the + * uuid_info_count so that we won't even try to save load info for this task + */ + if (!uuid_info_addr) { + uuid_info_count = 0; + } + } + + if (task_pid > 0 && uuid_info_count > 0) { + uint32_t uuid_info_size = sizeof(struct user64_dyld_uuid_info); + uint32_t uuid_array_size = uuid_info_count * uuid_info_size; + uint32_t uuid_copy_size = 0; + uint32_t uuid_image_count = 0; + char *current_uuid_buffer = NULL; + /* Copy in the UUID info array. It may be nonresident, in which case just fix up nloadinfos to 0 */ + + kdb_printf("\nuuid info:\n"); + while (uuid_array_size) { + if (uuid_array_size <= PANICLOG_UUID_BUF_SIZE) { + uuid_copy_size = uuid_array_size; + uuid_image_count = uuid_array_size/uuid_info_size; + } else { + uuid_image_count = PANICLOG_UUID_BUF_SIZE/uuid_info_size; + uuid_copy_size = uuid_image_count * uuid_info_size; + } + if (have_pmap && !debug_copyin(task->map->pmap, uuid_info_addr, uuidbufptr, + uuid_copy_size)) { + kdb_printf("Error!! Failed to copy UUID info for task %p pid %d\n", task, task_pid); + uuid_image_count = 0; + break; + } + + if (uuid_image_count > 0) { + current_uuid_buffer = uuidbufptr; + for (k = 0; k < uuid_image_count; k++) { + kdb_printf(" %#llx", *(uint64_t *)current_uuid_buffer); + current_uuid_buffer += sizeof(uint64_t); + uint8_t *uuid = (uint8_t *)current_uuid_buffer; + kdb_printf("\tuuid = <%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x>\n", + uuid[0], uuid[1], uuid[2], uuid[3], uuid[4], uuid[5], uuid[6], uuid[7], uuid[8], + uuid[9], uuid[10], uuid[11], uuid[12], uuid[13], uuid[14], uuid[15]); + current_uuid_buffer += 16; + } + bzero(&uuidbuf, sizeof(uuidbuf)); + } + uuid_info_addr += uuid_copy_size; + uuid_array_size -= uuid_copy_size; + } + } +} + +void print_launchd_info(void) +{ + task_t task = current_task(); + thread_t thread = current_thread(); + volatile uint32_t *ppbtcnt = &pbtcnt; + uint64_t bt_tsc_timeout; + int cn = cpu_number(); + + if(pbtcpu != cn) { + hw_atomic_add(&pbtcnt, 1); + /* Spin on print backtrace lock, which serializes output + * Continue anyway if a timeout occurs. + */ + hw_lock_to(&pbtlock, ~0U); + pbtcpu = cn; + } + + print_uuid_info(task); + print_threads_registers(thread); + print_tasks_user_threads(task); + kdb_printf("Mac OS version: %s\n", (osversion[0] != 0) ? osversion : "Not yet set"); + kdb_printf("Kernel version: %s\n", version); + panic_display_kernel_uuid(); + panic_display_model_name(); + + /* Release print backtrace lock, to permit other callers in the + * event of panics on multiple processors. + */ + hw_lock_unlock(&pbtlock); + hw_atomic_sub(&pbtcnt, 1); + /* Wait for other processors to complete output + * Timeout and continue after PBT_TIMEOUT_CYCLES. 
+ */ + bt_tsc_timeout = rdtsc64() + PBT_TIMEOUT_CYCLES; + while(*ppbtcnt && (rdtsc64() < bt_tsc_timeout)); + +} diff --git a/osfmk/i386/acpi.c b/osfmk/i386/acpi.c index e42d4aef2..76617a0da 100644 --- a/osfmk/i386/acpi.c +++ b/osfmk/i386/acpi.c @@ -35,6 +35,9 @@ #if CONFIG_MTRR #include #endif +#if HYPERVISOR +#include +#endif #if CONFIG_VMX #include #endif @@ -193,6 +196,11 @@ acpi_sleep_kernel(acpi_sleep_callback func, void *refcon) /* Save power management timer state */ pmTimerSave(); +#if HYPERVISOR + /* Notify hypervisor that we are about to sleep */ + hv_suspend(); +#endif + #if CONFIG_VMX /* * Turn off VT, otherwise switching to legacy mode will fail diff --git a/osfmk/i386/cpuid.c b/osfmk/i386/cpuid.c index 532c49ee3..9cfd5892b 100644 --- a/osfmk/i386/cpuid.c +++ b/osfmk/i386/cpuid.c @@ -762,12 +762,10 @@ cpuid_set_cpufamily(i386_cpu_info_t *info_p) case CPUID_MODEL_CRYSTALWELL: cpufamily = CPUFAMILY_INTEL_HASWELL; break; -#if !defined(XNU_HIDE_SEED) case CPUID_MODEL_BROADWELL: case CPUID_MODEL_BRYSTALWELL: cpufamily = CPUFAMILY_INTEL_BROADWELL; break; -#endif /* not XNU_HIDE_SEED */ } break; } @@ -944,9 +942,7 @@ leaf7_feature_map[] = { {CPUID_LEAF7_FEATURE_RTM, "RTM"}, {CPUID_LEAF7_FEATURE_RDSEED, "RDSEED"}, {CPUID_LEAF7_FEATURE_ADX, "ADX"}, -#if !defined(XNU_HIDE_SEED) {CPUID_LEAF7_FEATURE_SMAP, "SMAP"}, -#endif /* not XNU_HIDE_SEED */ {0, 0} }; diff --git a/osfmk/i386/cpuid.h b/osfmk/i386/cpuid.h index 1f58d5250..980945d50 100644 --- a/osfmk/i386/cpuid.h +++ b/osfmk/i386/cpuid.h @@ -135,9 +135,7 @@ #define CPUID_LEAF7_FEATURE_RTM _Bit(11) /* RTM */ #define CPUID_LEAF7_FEATURE_RDSEED _Bit(18) /* RDSEED Instruction */ #define CPUID_LEAF7_FEATURE_ADX _Bit(19) /* ADX Instructions */ -#if !defined(XNU_HIDE_SEED) #define CPUID_LEAF7_FEATURE_SMAP _Bit(20) /* Supervisor Mode Access Protect */ -#endif /* not XNU_HIDE_SEED */ /* * The CPUID_EXTFEATURE_XXX values define 64-bit values @@ -203,12 +201,10 @@ #define CPUID_MODEL_HASWELL 0x3C #define CPUID_MODEL_HASWELL_EP 0x3F #define CPUID_MODEL_HASWELL_ULT 0x45 -#if !defined(XNU_HIDE_SEED) #define CPUID_MODEL_BROADWELL 0x3D #define CPUID_MODEL_BROADWELL_ULX 0x3D #define CPUID_MODEL_BROADWELL_ULT 0x3D #define CPUID_MODEL_BRYSTALWELL 0x47 -#endif /* not XNU_HIDE_SEED */ #define CPUID_VMM_FAMILY_UNKNOWN 0x0 #define CPUID_VMM_FAMILY_VMWARE 0x1 diff --git a/osfmk/i386/panic_hooks.c b/osfmk/i386/panic_hooks.c index 113031cfa..e561a6900 100644 --- a/osfmk/i386/panic_hooks.c +++ b/osfmk/i386/panic_hooks.c @@ -167,7 +167,7 @@ void panic_dump_mem(const void *addr, int len) } } -bool panic_phys_range_before(const void *addr, uint64_t *pphys, +boolean_t panic_phys_range_before(const void *addr, uint64_t *pphys, panic_phys_range_t *range) { *pphys = kvtophys((vm_offset_t)addr); diff --git a/osfmk/i386/panic_hooks.h b/osfmk/i386/panic_hooks.h index 92905ebb4..10b38e575 100644 --- a/osfmk/i386/panic_hooks.h +++ b/osfmk/i386/panic_hooks.h @@ -32,7 +32,7 @@ #if XNU_KERNEL_PRIVATE #include -#include +#include typedef struct { uint64_t opaque[6]; @@ -53,7 +53,7 @@ typedef struct panic_phys_range { uint64_t len; } panic_phys_range_t; -bool panic_phys_range_before(const void *addr, uint64_t *pphys, +boolean_t panic_phys_range_before(const void *addr, uint64_t *pphys, panic_phys_range_t *range); #endif // XNU_KERNEL_PRIVATE diff --git a/osfmk/i386/proc_reg.h b/osfmk/i386/proc_reg.h index 2229ab8a8..ba0f1b1e5 100644 --- a/osfmk/i386/proc_reg.h +++ b/osfmk/i386/proc_reg.h @@ -373,6 +373,16 @@ static inline void invlpg(uintptr_t addr) __asm__ volatile("invlpg 
(%0)" :: "r" (addr) : "memory"); } +static inline void clac(void) +{ + __asm__ volatile("clac"); +} + +static inline void stac(void) +{ + __asm__ volatile("stac"); +} + /* * Access to machine-specific registers (available on 586 and better only) * Note: the rd* operations modify the parameters directly (without using diff --git a/osfmk/i386/trap.c b/osfmk/i386/trap.c index 0cedaa19d..3a99e32a7 100644 --- a/osfmk/i386/trap.c +++ b/osfmk/i386/trap.c @@ -624,6 +624,17 @@ kernel_trap( goto debugger_entry; } + /* + * Additionally check for SMAP faults... + * which are characterized by page-present and + * the AC bit unset (i.e. not from copyin/out path). + */ + if (__improbable(code & T_PF_PROT && + pmap_smap_enabled && + (saved_state->isf.rflags & EFL_AC) == 0)) { + goto debugger_entry; + } + /* * If we're not sharing cr3 with the user * and we faulted in copyio, @@ -802,6 +813,7 @@ panic_trap(x86_saved_state64_t *regs) const char *trapname = "Unknown"; pal_cr_t cr0, cr2, cr3, cr4; boolean_t potential_smep_fault = FALSE, potential_kernel_NX_fault = FALSE; + boolean_t potential_smap_fault = FALSE; pal_get_control_registers( &cr0, &cr2, &cr3, &cr4 ); assert(ml_get_interrupts_enabled() == FALSE); @@ -826,6 +838,12 @@ panic_trap(x86_saved_state64_t *regs) } else if (regs->isf.rip >= VM_MIN_KERNEL_AND_KEXT_ADDRESS) { potential_kernel_NX_fault = TRUE; } + } else if (pmap_smap_enabled && + regs->isf.trapno == T_PAGE_FAULT && + regs->isf.err & T_PF_PROT && + regs->cr2 < VM_MAX_USER_PAGE_ADDRESS && + regs->isf.rip >= VM_MIN_KERNEL_AND_KEXT_ADDRESS) { + potential_smap_fault = TRUE; } #undef panic @@ -848,7 +866,7 @@ panic_trap(x86_saved_state64_t *regs) virtualized ? " VMM" : "", potential_kernel_NX_fault ? " Kernel NX fault" : "", potential_smep_fault ? " SMEP/User NX fault" : "", - ""); + potential_smap_fault ? 
" SMAP fault" : ""); /* * This next statement is not executed, * but it's needed to stop the compiler using tail call optimization diff --git a/osfmk/i386/trap.h b/osfmk/i386/trap.h index 5eed5e2d1..619f87eaf 100644 --- a/osfmk/i386/trap.h +++ b/osfmk/i386/trap.h @@ -151,6 +151,12 @@ extern volatile perfASTCallback perfASTHook; extern volatile perfCallback perfIntHook; extern void panic_i386_backtrace(void *, int, const char *, boolean_t, x86_saved_state_t *); +extern void print_one_backtrace(pmap_t pmap, vm_offset_t topfp, const char *cur_marker, boolean_t is_64_bit, boolean_t nvram_format); +extern void print_tasks_user_threads(task_t task); +extern void print_threads_registers(thread_t thread); +extern void print_uuid_info(task_t task); +extern void print_launchd_info(void); + #if MACH_KDP extern boolean_t kdp_i386_trap( unsigned int, diff --git a/osfmk/ipc/mach_debug.c b/osfmk/ipc/mach_debug.c index 67c303afe..cc0acd912 100644 --- a/osfmk/ipc/mach_debug.c +++ b/osfmk/ipc/mach_debug.c @@ -480,6 +480,7 @@ mach_port_kobject( kaddr = (mach_vm_address_t)port->ip_kobject; ip_unlock(port); + if (0 != kaddr && is_ipc_kobject(*typep)) *addrp = VM_KERNEL_UNSLIDE_OR_PERM(kaddr); else diff --git a/osfmk/kern/bsd_kern.c b/osfmk/kern/bsd_kern.c index 3fc7a40a7..c2edb9069 100644 --- a/osfmk/kern/bsd_kern.c +++ b/osfmk/kern/bsd_kern.c @@ -49,6 +49,7 @@ /* BSD KERN COMPONENT INTERFACE */ task_t bsd_init_task = TASK_NULL; +boolean_t init_task_died; char init_task_failure_data[1024]; extern unsigned int not_in_kdp; /* Skip acquiring locks if we're in kdp */ diff --git a/osfmk/kern/debug.c b/osfmk/kern/debug.c index 6dc10f748..6f527a66f 100644 --- a/osfmk/kern/debug.c +++ b/osfmk/kern/debug.c @@ -101,6 +101,7 @@ unsigned int disable_debug_output = TRUE; unsigned int systemLogDiags = FALSE; unsigned int panicDebugging = FALSE; unsigned int logPanicDataToScreen = FALSE; +unsigned int kdebug_serial = FALSE; int mach_assert = 1; @@ -497,7 +498,7 @@ void populate_model_name(char *model_string) { strlcpy(model_name, model_string, sizeof(model_name)); } -static void panic_display_model_name(void) { +void panic_display_model_name(void) { char tmp_model_name[sizeof(model_name)]; if (ml_nofault_copy((vm_offset_t) &model_name, (vm_offset_t) &tmp_model_name, sizeof(model_name)) != sizeof(model_name)) @@ -509,7 +510,7 @@ static void panic_display_model_name(void) { kdb_printf("System model name: %s\n", tmp_model_name); } -static void panic_display_kernel_uuid(void) { +void panic_display_kernel_uuid(void) { char tmp_kernel_uuid[sizeof(kernel_uuid_string)]; if (ml_nofault_copy((vm_offset_t) &kernel_uuid_string, (vm_offset_t) &tmp_kernel_uuid, sizeof(kernel_uuid_string)) != sizeof(kernel_uuid_string)) @@ -628,6 +629,8 @@ __private_extern__ void panic_display_ecc_errors() #if CONFIG_ZLEAKS extern boolean_t panic_include_ztrace; extern struct ztrace* top_ztrace; +void panic_print_symbol_name(vm_address_t search); + /* * Prints the backtrace most suspected of being a leaker, if we paniced in the zone allocator. 
* top_ztrace and panic_include_ztrace comes from osfmk/kern/zalloc.c @@ -636,6 +639,9 @@ __private_extern__ void panic_display_ztrace(void) { if(panic_include_ztrace == TRUE) { unsigned int i = 0; + boolean_t keepsyms = FALSE; + + PE_parse_boot_argn("keepsyms", &keepsyms, sizeof (keepsyms)); struct ztrace top_ztrace_copy; /* Make sure not to trip another panic if there's something wrong with memory */ @@ -643,7 +649,11 @@ __private_extern__ void panic_display_ztrace(void) kdb_printf("\nBacktrace suspected of leaking: (outstanding bytes: %lu)\n", (uintptr_t)top_ztrace_copy.zt_size); /* Print the backtrace addresses */ for (i = 0; (i < top_ztrace_copy.zt_depth && i < MAX_ZTRACE_DEPTH) ; i++) { - kdb_printf("%p\n", top_ztrace_copy.zt_stack[i]); + kdb_printf("%p ", top_ztrace_copy.zt_stack[i]); + if (keepsyms) { + panic_print_symbol_name((vm_address_t)top_ztrace_copy.zt_stack[i]); + } + kdb_printf("\n"); } /* Print any kexts in that backtrace, along with their link addresses so we can properly blame them */ kmod_panic_dump((vm_offset_t *)&top_ztrace_copy.zt_stack[0], top_ztrace_copy.zt_depth); diff --git a/osfmk/kern/debug.h b/osfmk/kern/debug.h index 85acc47fd..407d4b4f2 100644 --- a/osfmk/kern/debug.h +++ b/osfmk/kern/debug.h @@ -299,6 +299,7 @@ extern unsigned int disable_debug_output; extern unsigned int panicDebugging; extern unsigned int logPanicDataToScreen; +extern unsigned int kdebug_serial; extern int db_run_mode; @@ -332,6 +333,8 @@ void panic_display_system_configuration(void); void panic_display_zprint(void); void panic_display_kernel_aslr(void); void panic_display_hibb(void); +void panic_display_model_name(void); +void panic_display_kernel_uuid(void); #if CONFIG_ZLEAKS void panic_display_ztrace(void); #endif /* CONFIG_ZLEAKS */ @@ -359,7 +362,8 @@ void panic_display_ecc_errors(void); * post-panic crashdump/paniclog * dump. 
*/ -#define DB_NMI_BTN_ENA 0x8000 /* Enable button to directly trigger NMI */ +#define DB_NMI_BTN_ENA 0x8000 /* Enable button to directly trigger NMI */ +#define DB_PRT_KDEBUG 0x10000 /* kprintf KDEBUG traces */ #if DEBUG /* diff --git a/osfmk/kern/hv_support.c b/osfmk/kern/hv_support.c index 9d032d2d0..c60df9886 100644 --- a/osfmk/kern/hv_support.c +++ b/osfmk/kern/hv_support.c @@ -44,6 +44,7 @@ int hv_support_available = 0; hv_callbacks_t hv_callbacks = { .dispatch = NULL, /* thread is being dispatched for execution */ .preempt = NULL, /* thread is being preempted */ + .suspend = NULL, /* system is being suspended */ .thread_destroy = NULL, /* thread is being destroyed */ .task_destroy = NULL, /* task is being destroyed */ .volatile_state = NULL, /* thread state is becoming volatile */ @@ -142,7 +143,7 @@ hv_mp_notify(void) { lck_mtx_unlock(hv_support_lck_mtx); break; } else { - hv_callbacks.memory_pressure(NULL); + hv_callbacks.memory_pressure(); } lck_mtx_unlock(hv_support_lck_mtx); } @@ -244,6 +245,7 @@ hv_release_callbacks(void) { hv_callbacks = (hv_callbacks_t) { .dispatch = NULL, .preempt = NULL, + .suspend = NULL, .thread_destroy = NULL, .task_destroy = NULL, .volatile_state = NULL, @@ -254,6 +256,14 @@ hv_release_callbacks(void) { lck_mtx_unlock(hv_support_lck_mtx); } +/* system suspend notification */ +void +hv_suspend(void) { + if (hv_callbacks_enabled) { + hv_callbacks.suspend(); + } +} + /* dispatch hv_task_trap/hv_thread_trap syscalls to trap handlers, fail for invalid index or absence of trap handlers, trap handler is responsible for validating targets */ diff --git a/osfmk/kern/hv_support.h b/osfmk/kern/hv_support.h index 485654f70..aaedb76ae 100644 --- a/osfmk/kern/hv_support.h +++ b/osfmk/kern/hv_support.h @@ -45,9 +45,7 @@ typedef enum { HV_THREAD_TRAP = 1 } hv_trap_type_t; -typedef kern_return_t (*hv_trap_t) (void *thread_target, uint64_t arg); -typedef void (*hv_callback_0_t)(void *target); -typedef void (*hv_callback_1_t)(void *target, int argument); +typedef kern_return_t (*hv_trap_t) (void *target, uint64_t arg); typedef struct { const hv_trap_t *traps; @@ -55,12 +53,13 @@ typedef struct { } hv_trap_table_t; typedef struct { - hv_callback_0_t dispatch; - hv_callback_0_t preempt; - hv_callback_0_t thread_destroy; - hv_callback_0_t task_destroy; - hv_callback_1_t volatile_state; - hv_callback_0_t memory_pressure; + void (*dispatch)(void *vcpu); + void (*preempt)(void *vcpu); + void (*suspend)(void); + void (*thread_destroy)(void *vcpu); + void (*task_destroy)(void *vm); + void (*volatile_state)(void *vcpu, int state); + void (*memory_pressure)(void); } hv_callbacks_t; extern hv_callbacks_t hv_callbacks; @@ -79,7 +78,8 @@ extern kern_return_t hv_set_traps(hv_trap_type_t trap_type, const hv_trap_t *traps, unsigned trap_count); extern void hv_release_traps(hv_trap_type_t trap_type); extern kern_return_t hv_set_callbacks(hv_callbacks_t callbacks); -extern void hv_release_callbacks(void) ; +extern void hv_release_callbacks(void); +extern void hv_suspend(void); extern kern_return_t hv_task_trap(uint64_t index, uint64_t arg); extern kern_return_t hv_thread_trap(uint64_t index, uint64_t arg); diff --git a/osfmk/kern/sfi.c b/osfmk/kern/sfi.c index 725164c77..819c6c686 100644 --- a/osfmk/kern/sfi.c +++ b/osfmk/kern/sfi.c @@ -162,6 +162,7 @@ struct sfi_class_state { uint64_t off_time_interval; timer_call_data_t on_timer; + uint64_t on_timer_deadline; boolean_t on_timer_programmed; boolean_t class_sfi_is_enabled; @@ -335,12 +336,15 @@ static void sfi_timer_global_off( /* Push 
out on-timer */ on_timer_deadline = now + sfi_classes[i].off_time_interval; + sfi_classes[i].on_timer_deadline = on_timer_deadline; + timer_call_enter1(&sfi_classes[i].on_timer, NULL, on_timer_deadline, TIMER_CALL_SYS_CRITICAL); } else { /* If this class no longer needs SFI, make sure the timer is cancelled */ sfi_classes[i].class_in_on_phase = TRUE; if (sfi_classes[i].on_timer_programmed) { sfi_classes[i].on_timer_programmed = FALSE; + sfi_classes[i].on_timer_deadline = ~0ULL; timer_call_cancel(&sfi_classes[i].on_timer); } } @@ -420,7 +424,10 @@ static void sfi_timer_per_class_on( * Since we have the sfi_lock held and have changed "class_in_on_phase", we expect * no new threads to be put on this wait queue until the global "off timer" has fired. */ + sfi_class->class_in_on_phase = TRUE; + sfi_class->on_timer_programmed = FALSE; + kret = wait_queue_wakeup64_all(&sfi_class->wait_queue, CAST_EVENT64_T(sfi_class_id), THREAD_AWAKENED); @@ -532,6 +539,52 @@ kern_return_t sfi_window_cancel(void) return (KERN_SUCCESS); } +/* Defers SFI off and per-class on timers (if live) by the specified interval + * in Mach Absolute Time Units. Currently invoked to align with the global + * forced idle mechanism. Making some simplifying assumptions, the iterative GFI + * induced SFI on+off deferrals form a geometric series that converges to yield + * an effective SFI duty cycle that is scaled by the GFI duty cycle. Initial phase + * alignment and congruency of the SFI/GFI periods can distort this to some extent. + */ + +kern_return_t sfi_defer(uint64_t sfi_defer_matus) +{ + spl_t s; + kern_return_t kr = KERN_FAILURE; + s = splsched(); + + KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_GLOBAL_DEFER), sfi_defer_matus, 0, 0, 0, 0); + + simple_lock(&sfi_lock); + if (!sfi_is_enabled) { + goto sfi_defer_done; + } + + assert(sfi_next_off_deadline != 0); + + sfi_next_off_deadline += sfi_defer_matus; + timer_call_enter1(&sfi_timer_call_entry, NULL, sfi_next_off_deadline, TIMER_CALL_SYS_CRITICAL); + + int i; + for (i = 0; i < MAX_SFI_CLASS_ID; i++) { + if (sfi_classes[i].class_sfi_is_enabled) { + if (sfi_classes[i].on_timer_programmed) { + uint64_t new_on_deadline = sfi_classes[i].on_timer_deadline + sfi_defer_matus; + sfi_classes[i].on_timer_deadline = new_on_deadline; + timer_call_enter1(&sfi_classes[i].on_timer, NULL, new_on_deadline, TIMER_CALL_SYS_CRITICAL); + } + } + } + + kr = KERN_SUCCESS; +sfi_defer_done: + simple_unlock(&sfi_lock); + + splx(s); + + return (kr); +} + kern_return_t sfi_get_window(uint64_t *window_usecs) { diff --git a/osfmk/kern/sfi.h b/osfmk/kern/sfi.h index 385b57cf0..7ac6259b3 100644 --- a/osfmk/kern/sfi.h +++ b/osfmk/kern/sfi.h @@ -64,6 +64,7 @@ ast_t sfi_processor_needs_ast(processor_t processor); void sfi_ast(thread_t thread); void sfi_reevaluate(thread_t thread); +kern_return_t sfi_defer(uint64_t); #endif /* MACH_KERNEL_PRIVATE */ #endif /* _KERN_SFI_H_ */ diff --git a/osfmk/kern/startup.c b/osfmk/kern/startup.c index e67c01a8b..6ddd0389c 100644 --- a/osfmk/kern/startup.c +++ b/osfmk/kern/startup.c @@ -192,6 +192,7 @@ unsigned int new_nkdbufs = 0; unsigned int wake_nkdbufs = 0; unsigned int write_trace_on_panic = 0; unsigned int trace_typefilter = 0; +boolean_t trace_serial = FALSE; /* mach leak logging */ int log_leaks = 0; @@ -480,6 +481,11 @@ kernel_bootstrap_thread(void) #endif #if (defined(__i386__) || defined(__x86_64__)) + if (kdebug_serial) { + new_nkdbufs = 1; + if (trace_typefilter == 0) + trace_typefilter = 1; + } if (turn_on_log_leaks && !new_nkdbufs) new_nkdbufs = 
200000; if (trace_typefilter) diff --git a/osfmk/kern/thread.c b/osfmk/kern/thread.c index b9a7ae0eb..8b7d863b7 100644 --- a/osfmk/kern/thread.c +++ b/osfmk/kern/thread.c @@ -309,6 +309,12 @@ thread_bootstrap(void) #endif /* HYPERVISOR */ thread_template.t_chud = 0; + +#if (DEVELOPMENT || DEBUG) + thread_template.t_page_creation_throttled_hard = 0; + thread_template.t_page_creation_throttled_soft = 0; +#endif /* DEVELOPMENT || DEBUG */ + thread_template.t_page_creation_throttled = 0; thread_template.t_page_creation_count = 0; thread_template.t_page_creation_time = 0; @@ -663,7 +669,7 @@ void thread_terminate_enqueue( thread_t thread) { - KERNEL_DEBUG_CONSTANT(TRACEDBG_CODE(DBG_TRACE_DATA, TRACE_DATA_THREAD_TERMINATE) | DBG_FUNC_NONE, thread->thread_id, 0, 0, 0, 0); + KERNEL_DEBUG_CONSTANT(TRACE_DATA_THREAD_TERMINATE | DBG_FUNC_NONE, thread->thread_id, 0, 0, 0, 0); simple_lock(&thread_terminate_lock); enqueue_tail(&thread_terminate_queue, (queue_entry_t)thread); diff --git a/osfmk/kern/thread.h b/osfmk/kern/thread.h index a81bc6c8d..0b5061a33 100644 --- a/osfmk/kern/thread.h +++ b/osfmk/kern/thread.h @@ -411,6 +411,12 @@ struct thread { clock_sec_t t_page_creation_time; uint32_t t_page_creation_count; + uint32_t t_page_creation_throttled; +#if (DEVELOPMENT || DEBUG) + uint64_t t_page_creation_throttled_hard; + uint64_t t_page_creation_throttled_soft; +#endif /* DEVELOPMENT || DEBUG */ + #define T_CHUD_MARKED 0x01 /* this thread is marked by CHUD */ #define T_IN_CHUD 0x02 /* this thread is already in a CHUD handler */ #define THREAD_PMC_FLAG 0x04 /* Bit in "t_chud" signifying PMC interest */ diff --git a/osfmk/mach/machine.h b/osfmk/mach/machine.h index 7d3a98630..abd911562 100644 --- a/osfmk/mach/machine.h +++ b/osfmk/mach/machine.h @@ -393,9 +393,7 @@ __END_DECLS #define CPUFAMILY_INTEL_SANDYBRIDGE 0x5490b78c #define CPUFAMILY_INTEL_IVYBRIDGE 0x1f65e835 #define CPUFAMILY_INTEL_HASWELL 0x10b282dc -#if !defined(XNU_HIDE_SEED) #define CPUFAMILY_INTEL_BROADWELL 0x582ed09c -#endif /* not XNU_HIDE_SEED */ #define CPUFAMILY_ARM_9 0xe73283ae #define CPUFAMILY_ARM_11 0x8ff620d8 #define CPUFAMILY_ARM_XSCALE 0x53b005f5 diff --git a/osfmk/vm/vm_compressor.c b/osfmk/vm/vm_compressor.c index 402323060..224858282 100644 --- a/osfmk/vm/vm_compressor.c +++ b/osfmk/vm/vm_compressor.c @@ -60,6 +60,7 @@ int vm_compressor_mode = VM_PAGER_COMPRESSOR_WITH_SWAP; int vm_scale = 16; +int vm_compressor_is_active = 0; int vm_compression_limit = 0; extern boolean_t vm_swap_up; @@ -464,6 +465,9 @@ vm_compressor_init(void) vm_compressor_swap_init(); } + if (COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_SWAPBACKED) + vm_compressor_is_active = 1; + #if CONFIG_FREEZE memorystatus_freeze_enabled = TRUE; #endif /* CONFIG_FREEZE */ @@ -764,9 +768,9 @@ void c_seg_free_locked(c_segment_t c_seg) { int segno, i; - int pages_populated; + int pages_populated = 0; int32_t *c_buffer = NULL; - uint64_t c_swap_handle; + uint64_t c_swap_handle = 0; assert(!c_seg->c_on_minorcompact_q); @@ -1017,9 +1021,7 @@ struct { } c_seg_major_compact_stats; -#define C_MAJOR_COMPACTION_AGE_APPROPRIATE 30 -#define C_MAJOR_COMPACTION_OLD_ENOUGH 300 -#define C_MAJOR_COMPACTION_SIZE_APPROPRIATE ((C_SEG_BUFSIZE * 80) / 100) +#define C_MAJOR_COMPACTION_SIZE_APPROPRIATE ((C_SEG_BUFSIZE * 90) / 100) boolean_t @@ -2398,7 +2400,7 @@ static int c_compress_page(char *src, c_slot_mapping_t slot_ptr, c_segment_t *current_chead, char *scratch_buf) { int c_size; - int c_rounded_size; + int c_rounded_size = 0; int max_csize; c_slot_t cs; 
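/*
 * Illustrative sketch (not part of this change): the t_page_creation_throttled
 * and t_page_creation_throttled_hard/_soft fields added to struct thread above
 * back the reworked per-thread zero-fill throttle in the vm_fault.c hunks
 * below, which drops the old vm_page_creation_throttle global in favor of a
 * fixed budget of VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC pages over a
 * VM_PAGE_CREATION_THROTTLE_PERIOD_SECS window.  The standalone rendering
 * below is a simplification: the names, the time(2) clock and the
 * microsecond return value are illustrative, not the xnu code.
 */
#include <stdint.h>
#include <time.h>

#define THROTTLE_PERIOD_SECS    6       /* sliding window length             */
#define THROTTLE_RATE_PER_SEC   20000   /* page creations allowed per second */
#define SOFT_THROTTLE_USEC      200
#define HARD_THROTTLE_USEC      5000

struct fault_thread {
	time_t   page_creation_time;    /* start of the current window  */
	uint64_t page_creation_count;   /* pages created in that window */
};

/* Returns a stall in microseconds, or 0 if the thread is under budget. */
static unsigned int
page_creation_throttle(struct fault_thread *t, int memory_is_tight)
{
	time_t now = time(NULL);
	time_t elapsed = now - t->page_creation_time;

	t->page_creation_count++;

	/* Only judge the rate once a full window's allowance has been used. */
	if (t->page_creation_count <=
	    (uint64_t)THROTTLE_PERIOD_SECS * THROTTLE_RATE_PER_SEC)
		return 0;

	if (elapsed <= THROTTLE_PERIOD_SECS ||
	    t->page_creation_count / (uint64_t)elapsed >= THROTTLE_RATE_PER_SEC) {
		if (elapsed >= 3 * THROTTLE_PERIOD_SECS) {
			/*
			 * Long-running offender: decay the stats so a thread
			 * that settles down can escape the throttle, but keep
			 * the count near the limit so it cannot burst again.
			 */
			t->page_creation_time = now;
			t->page_creation_count =
			    (uint64_t)THROTTLE_RATE_PER_SEC * (THROTTLE_PERIOD_SECS - 1);
		}
		return memory_is_tight ? HARD_THROTTLE_USEC : SOFT_THROTTLE_USEC;
	}

	/* Rate fell back under budget: restart the window. */
	t->page_creation_time = now;
	t->page_creation_count = 0;
	return 0;
}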
c_segment_t c_seg; diff --git a/osfmk/vm/vm_fault.c b/osfmk/vm/vm_fault.c index e39ebf9b1..bb506cedd 100644 --- a/osfmk/vm/vm_fault.c +++ b/osfmk/vm/vm_fault.c @@ -135,11 +135,15 @@ uint64_t vm_hard_throttle_threshold; #define NEED_TO_HARD_THROTTLE_THIS_TASK() (vm_wants_task_throttled(current_task()) || \ (vm_page_free_count < vm_page_throttle_limit && \ - proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO) >= THROTTLE_LEVEL_THROTTLED)) + proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO) > THROTTLE_LEVEL_THROTTLED)) -#define HARD_THROTTLE_DELAY 20000 /* 20000 us == 20 ms */ -#define SOFT_THROTTLE_DELAY 2000 /* 2000 us == 2 ms */ +#define HARD_THROTTLE_DELAY 5000 /* 5000 us == 5 ms */ +#define SOFT_THROTTLE_DELAY 200 /* 200 us == .2 ms */ + +#define VM_PAGE_CREATION_THROTTLE_PERIOD_SECS 6 +#define VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC 20000 + boolean_t current_thread_aborted(void); @@ -544,8 +548,13 @@ vm_fault_deactivate_behind( } +#if (DEVELOPMENT || DEBUG) +uint32_t vm_page_creation_throttled_hard = 0; +uint32_t vm_page_creation_throttled_soft = 0; +#endif /* DEVELOPMENT || DEBUG */ + static int -vm_page_throttled(void) +vm_page_throttled(boolean_t page_kept) { clock_sec_t elapsed_sec; clock_sec_t tv_sec; @@ -556,21 +565,31 @@ vm_page_throttled(void) if (thread->options & TH_OPT_VMPRIV) return (0); - thread->t_page_creation_count++; - - if (NEED_TO_HARD_THROTTLE_THIS_TASK()) + if (thread->t_page_creation_throttled) { + thread->t_page_creation_throttled = 0; + + if (page_kept == FALSE) + goto no_throttle; + } + if (NEED_TO_HARD_THROTTLE_THIS_TASK()) { +#if (DEVELOPMENT || DEBUG) + thread->t_page_creation_throttled_hard++; + OSAddAtomic(1, &vm_page_creation_throttled_hard); +#endif /* DEVELOPMENT || DEBUG */ return (HARD_THROTTLE_DELAY); + } if ((vm_page_free_count < vm_page_throttle_limit || ((COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) && SWAPPER_NEEDS_TO_UNTHROTTLE())) && - thread->t_page_creation_count > vm_page_creation_throttle) { + thread->t_page_creation_count > (VM_PAGE_CREATION_THROTTLE_PERIOD_SECS * VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC)) { clock_get_system_microtime(&tv_sec, &tv_usec); elapsed_sec = tv_sec - thread->t_page_creation_time; - if (elapsed_sec <= 6 || (thread->t_page_creation_count / elapsed_sec) >= (vm_page_creation_throttle / 6)) { + if (elapsed_sec <= VM_PAGE_CREATION_THROTTLE_PERIOD_SECS || + (thread->t_page_creation_count / elapsed_sec) >= VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC) { - if (elapsed_sec >= 60) { + if (elapsed_sec >= (3 * VM_PAGE_CREATION_THROTTLE_PERIOD_SECS)) { /* * we'll reset our stats to give a well behaved app * that was unlucky enough to accumulate a bunch of pages @@ -581,22 +600,35 @@ vm_page_throttled(void) * will remain in the throttled state */ thread->t_page_creation_time = tv_sec; - thread->t_page_creation_count = (vm_page_creation_throttle / 6) * 5; + thread->t_page_creation_count = VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC * (VM_PAGE_CREATION_THROTTLE_PERIOD_SECS - 1); } ++vm_page_throttle_count; - if ((COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) && HARD_THROTTLE_LIMIT_REACHED()) + thread->t_page_creation_throttled = 1; + + if ((COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) && HARD_THROTTLE_LIMIT_REACHED()) { +#if (DEVELOPMENT || DEBUG) + thread->t_page_creation_throttled_hard++; + OSAddAtomic(1, &vm_page_creation_throttled_hard); +#endif /* DEVELOPMENT || DEBUG */ return (HARD_THROTTLE_DELAY); - else + } else { +#if 
(DEVELOPMENT || DEBUG) + thread->t_page_creation_throttled_soft++; + OSAddAtomic(1, &vm_page_creation_throttled_soft); +#endif /* DEVELOPMENT || DEBUG */ return (SOFT_THROTTLE_DELAY); + } } thread->t_page_creation_time = tv_sec; thread->t_page_creation_count = 0; } +no_throttle: + thread->t_page_creation_count++; + return (0); } - /* * check for various conditions that would * prevent us from creating a ZF page... @@ -606,7 +638,7 @@ vm_page_throttled(void) * object == m->object */ static vm_fault_return_t -vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, boolean_t interruptible_state) +vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, boolean_t interruptible_state, boolean_t page_throttle) { int throttle_delay; @@ -647,7 +679,7 @@ vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, boolean_t int return (VM_FAULT_RETRY); } } - if ((throttle_delay = vm_page_throttled())) { + if (page_throttle == TRUE && (throttle_delay = vm_page_throttled(FALSE))) { /* * we're throttling zero-fills... * treat this as if we couldn't grab a page @@ -1150,7 +1182,7 @@ vm_fault_page( * fault cleanup in the case of an error condition * including resetting the thread_interrupt_level */ - error = vm_fault_check(object, m, first_m, interruptible_state); + error = vm_fault_check(object, m, first_m, interruptible_state, (type_of_fault == NULL) ? TRUE : FALSE); if (error != VM_FAULT_SUCCESS) return (error); @@ -1560,6 +1592,21 @@ vm_fault_page( 0, &compressed_count_delta); + if (type_of_fault == NULL) { + int throttle_delay; + + /* + * we weren't called from vm_fault, so we + * need to apply page creation throttling + * do it before we re-acquire any locks + */ + if (my_fault_type == DBG_COMPRESSOR_FAULT) { + if ((throttle_delay = vm_page_throttled(TRUE))) { + VM_DEBUG_EVENT(vmf_compressordelay, VMF_COMPRESSORDELAY, DBG_FUNC_NONE, throttle_delay, 0, 1, 0); + delay(throttle_delay); + } + } + } vm_object_lock(object); assert(object->paging_in_progress > 0); @@ -1856,7 +1903,7 @@ dont_look_for_page: * fault cleanup in the case of an error condition * including resetting the thread_interrupt_level */ - error = vm_fault_check(object, m, first_m, interruptible_state); + error = vm_fault_check(object, m, first_m, interruptible_state, (type_of_fault == NULL) ? TRUE : FALSE); if (error != VM_FAULT_SUCCESS) return (error); @@ -3885,31 +3932,6 @@ FastPmapEnter: */ assert(object_lock_type == OBJECT_LOCK_EXCLUSIVE); - if ((throttle_delay = vm_page_throttled())) { - /* - * drop all of our locks... - * wait until the free queue is - * pumped back up and then - * redrive the fault - */ - if (object != cur_object) - vm_object_unlock(cur_object); - vm_object_unlock(object); - vm_map_unlock_read(map); - if (real_map != map) - vm_map_unlock(real_map); - - VM_DEBUG_EVENT(vmf_cowdelay, VMF_COWDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0); - - delay(throttle_delay); - - if (!current_thread_aborted() && vm_page_wait((change_wiring) ? - THREAD_UNINT : - THREAD_ABORTSAFE)) - goto RetryFault; - kr = KERN_ABORTED; - goto done; - } /* * If objects match, then * object->copy must not be NULL (else control @@ -4268,31 +4290,6 @@ FastPmapEnter: kr = KERN_MEMORY_ERROR; goto done; } - if ((throttle_delay = vm_page_throttled())) { - /* - * drop all of our locks... 
- * wait until the free queue is - * pumped back up and then - * redrive the fault - */ - if (object != cur_object) - vm_object_unlock(cur_object); - vm_object_unlock(object); - vm_map_unlock_read(map); - if (real_map != map) - vm_map_unlock(real_map); - - VM_DEBUG_EVENT(vmf_zfdelay, VMF_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0); - - delay(throttle_delay); - - if (!current_thread_aborted() && vm_page_wait((change_wiring) ? - THREAD_UNINT : - THREAD_ABORTSAFE)) - goto RetryFault; - kr = KERN_ABORTED; - goto done; - } if (vm_backing_store_low) { /* * we are protecting the system from @@ -4829,12 +4826,27 @@ done: thread_interrupt_level(interruptible_state); /* - * Only throttle on faults which cause a pagein. + * Only I/O throttle on faults which cause a pagein/swapin. */ if ((type_of_fault == DBG_PAGEIND_FAULT) || (type_of_fault == DBG_PAGEINV_FAULT) || (type_of_fault == DBG_COMPRESSOR_SWAPIN_FAULT)) { throttle_lowpri_io(1); - } + } else { + if (kr == KERN_SUCCESS && type_of_fault != DBG_CACHE_HIT_FAULT && type_of_fault != DBG_GUARD_FAULT) { + if ((throttle_delay = vm_page_throttled(TRUE))) { + + if (vm_debug_events) { + if (type_of_fault == DBG_COMPRESSOR_FAULT) + VM_DEBUG_EVENT(vmf_compressordelay, VMF_COMPRESSORDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0); + else if (type_of_fault == DBG_COW_FAULT) + VM_DEBUG_EVENT(vmf_cowdelay, VMF_COWDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0); + else + VM_DEBUG_EVENT(vmf_zfdelay, VMF_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0); + } + delay(throttle_delay); + } + } + } KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END, ((uint64_t)vaddr >> 32), diff --git a/osfmk/vm/vm_map.c b/osfmk/vm/vm_map.c index a22b763c1..ca11e1bae 100644 --- a/osfmk/vm/vm_map.c +++ b/osfmk/vm/vm_map.c @@ -6199,6 +6199,7 @@ vm_map_copy_copy( */ new_copy = (vm_map_copy_t) zalloc(vm_map_copy_zone); + new_copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE; *new_copy = *copy; if (copy->type == VM_MAP_COPY_ENTRY_LIST) { @@ -6847,6 +6848,7 @@ start_overwrite: /* destroyed after successful copy_overwrite */ copy = (vm_map_copy_t) zalloc(vm_map_copy_zone); + copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE; vm_map_copy_first_entry(copy) = vm_map_copy_last_entry(copy) = vm_map_copy_to_entry(copy); @@ -7150,6 +7152,7 @@ vm_map_copy_overwrite( * Extract "head_copy" out of "copy". */ head_copy = (vm_map_copy_t) zalloc(vm_map_copy_zone); + head_copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE; vm_map_copy_first_entry(head_copy) = vm_map_copy_to_entry(head_copy); vm_map_copy_last_entry(head_copy) = @@ -7191,6 +7194,7 @@ vm_map_copy_overwrite( * Extract "tail_copy" out of "copy". 
*/ tail_copy = (vm_map_copy_t) zalloc(vm_map_copy_zone); + tail_copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE; vm_map_copy_first_entry(tail_copy) = vm_map_copy_to_entry(tail_copy); vm_map_copy_last_entry(tail_copy) = @@ -8657,6 +8661,7 @@ vm_map_copyin_common( */ copy = (vm_map_copy_t) zalloc(vm_map_copy_zone); + copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE; vm_map_copy_first_entry(copy) = vm_map_copy_last_entry(copy) = vm_map_copy_to_entry(copy); copy->type = VM_MAP_COPY_ENTRY_LIST; @@ -9392,6 +9397,7 @@ vm_map_copy_extract( */ copy = (vm_map_copy_t) zalloc(vm_map_copy_zone); + copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE; vm_map_copy_first_entry(copy) = vm_map_copy_last_entry(copy) = vm_map_copy_to_entry(copy); copy->type = VM_MAP_COPY_ENTRY_LIST; @@ -9443,6 +9449,7 @@ vm_map_copyin_object( */ copy = (vm_map_copy_t) zalloc(vm_map_copy_zone); + copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE; copy->type = VM_MAP_COPY_OBJECT; copy->cpy_object = object; copy->offset = offset; diff --git a/osfmk/vm/vm_map_store.c b/osfmk/vm/vm_map_store.c index 3a2b381f0..288bafba1 100644 --- a/osfmk/vm/vm_map_store.c +++ b/osfmk/vm/vm_map_store.c @@ -36,12 +36,23 @@ first_free_is_valid_store( vm_map_t map ) } #endif +boolean_t +vm_map_store_has_RB_support( struct vm_map_header *hdr ) +{ + if ((void*)hdr->rb_head_store.rbh_root == (void*)(int)SKIP_RB_TREE) { + return FALSE; + } + return TRUE; +} + void vm_map_store_init( struct vm_map_header *hdr ) { vm_map_store_init_ll( hdr ); #ifdef VM_MAP_STORE_USE_RB - vm_map_store_init_rb( hdr ); + if (vm_map_store_has_RB_support( hdr )) { + vm_map_store_init_rb( hdr ); + } #endif } @@ -54,7 +65,12 @@ vm_map_store_lookup_entry( #ifdef VM_MAP_STORE_USE_LL return (vm_map_store_lookup_entry_ll( map, address, entry )); #elif defined VM_MAP_STORE_USE_RB - return (vm_map_store_lookup_entry_rb( map, address, entry )); + if (vm_map_store_has_RB_support( &map->hdr )) { + return (vm_map_store_lookup_entry_rb( map, address, entry )); + } else { + panic("VM map lookups need RB tree support.\n"); + return FALSE; /* For compiler warning.*/ + } #endif } @@ -81,7 +97,9 @@ void vm_map_store_copy_insert( vm_map_t map, vm_map_entry_t after_where, vm_map_ { vm_map_store_copy_insert_ll(map, after_where, copy); #ifdef VM_MAP_STORE_USE_RB - vm_map_store_copy_insert_rb(map, after_where, copy); + if (vm_map_store_has_RB_support( &map->hdr )) { + vm_map_store_copy_insert_rb(map, after_where, copy); + } #endif } @@ -104,7 +122,9 @@ _vm_map_store_entry_link( struct vm_map_header * mapHdr, vm_map_entry_t after_wh assert(entry->vme_start < entry->vme_end); vm_map_store_entry_link_ll(mapHdr, after_where, entry); #ifdef VM_MAP_STORE_USE_RB - vm_map_store_entry_link_rb(mapHdr, after_where, entry); + if (vm_map_store_has_RB_support( mapHdr )) { + vm_map_store_entry_link_rb(mapHdr, after_where, entry); + } #endif #if MAP_ENTRY_INSERTION_DEBUG fastbacktrace(&entry->vme_insertion_bt[0], @@ -126,7 +146,9 @@ vm_map_store_entry_link( vm_map_t map, vm_map_entry_t after_where, vm_map_entry_ } else { update_first_free_ll(VMEL_map, VMEL_map->first_free); #ifdef VM_MAP_STORE_USE_RB - update_first_free_rb(VMEL_map, VMEL_map->first_free); + if (vm_map_store_has_RB_support( &VMEL_map->hdr )) { + update_first_free_rb(VMEL_map, VMEL_map->first_free); + } #endif } } @@ -136,7 +158,9 @@ _vm_map_store_entry_unlink( struct vm_map_header * mapHdr, vm_map_entry_t entry) { vm_map_store_entry_unlink_ll(mapHdr, entry); #ifdef VM_MAP_STORE_USE_RB - 
vm_map_store_entry_unlink_rb(mapHdr, entry); + if (vm_map_store_has_RB_support( mapHdr )) { + vm_map_store_entry_unlink_rb(mapHdr, entry); + } #endif } @@ -158,7 +182,9 @@ vm_map_store_entry_unlink( vm_map_t map, vm_map_entry_t entry) vm_map_store_update( map, entry, VM_MAP_ENTRY_DELETE); update_first_free_ll(VMEU_map, VMEU_first_free); #ifdef VM_MAP_STORE_USE_RB - update_first_free_rb(VMEU_map, VMEU_first_free); + if (vm_map_store_has_RB_support( &VMEU_map->hdr )) { + update_first_free_rb(VMEU_map, VMEU_first_free); + } #endif } @@ -168,7 +194,9 @@ vm_map_store_copy_reset( vm_map_copy_t copy,vm_map_entry_t entry) int nentries = copy->cpy_hdr.nentries; vm_map_store_copy_reset_ll(copy, entry, nentries); #ifdef VM_MAP_STORE_USE_RB - vm_map_store_copy_reset_rb(copy, entry, nentries); + if (vm_map_store_has_RB_support( ©->c_u.hdr )) { + vm_map_store_copy_reset_rb(copy, entry, nentries); + } #endif } @@ -177,6 +205,8 @@ vm_map_store_update_first_free( vm_map_t map, vm_map_entry_t first_free) { update_first_free_ll(map, first_free); #ifdef VM_MAP_STORE_USE_RB - update_first_free_rb(map, first_free); + if (vm_map_store_has_RB_support( &map->hdr )) { + update_first_free_rb(map, first_free); + } #endif } diff --git a/osfmk/vm/vm_map_store.h b/osfmk/vm/vm_map_store.h index dab7746ed..b6c12fe19 100644 --- a/osfmk/vm/vm_map_store.h +++ b/osfmk/vm/vm_map_store.h @@ -114,6 +114,8 @@ struct vm_map_store { (map)->hint = (value); \ MACRO_END +#define SKIP_RB_TREE 0xBAADC0D1 + #define VM_MAP_ENTRY_CREATE 1 #define VM_MAP_ENTRY_DELETE 2 @@ -130,6 +132,7 @@ void vm_map_store_copy_reset( struct vm_map_copy*, struct vm_map_entry*); #if MACH_ASSERT boolean_t first_free_is_valid_store( struct _vm_map*); #endif +boolean_t vm_map_store_has_RB_support( struct vm_map_header *hdr ); #endif /* _VM_VM_MAP_STORE_H */ diff --git a/osfmk/vm/vm_pageout.c b/osfmk/vm/vm_pageout.c index 7d78f66b2..080ffb5e7 100644 --- a/osfmk/vm/vm_pageout.c +++ b/osfmk/vm/vm_pageout.c @@ -1202,7 +1202,6 @@ struct flow_control { uint32_t vm_pageout_considered_page = 0; uint32_t vm_page_filecache_min = 0; -#define VM_PAGE_FILECACHE_MIN 50000 #define ANONS_GRABBED_LIMIT 2 /* @@ -1664,6 +1663,16 @@ return_from_scan: if (cache_evict_throttle) cache_evict_throttle--; + /* + * don't let the filecache_min fall below 33% of available memory... + * + * on systems w/o the compressor/swapper, the filecache is always + * a very large percentage of the AVAILABLE_NON_COMPRESSED_MEMORY + * since most (if not all) of the anonymous pages are in the + * throttled queue (which isn't counted as available) which + * effectively disables this filter + */ + vm_page_filecache_min = (AVAILABLE_NON_COMPRESSED_MEMORY / 3); exceeded_burst_throttle = FALSE; /* @@ -1961,6 +1970,15 @@ consider_inactive: page_prev_state = PAGE_STATE_INACTIVE; anons_grabbed = 0; + if (vm_page_pageable_external_count < vm_page_filecache_min) { + if ((++reactivated_this_call % 100)) + goto must_activate_page; + /* + * steal 1% of the file backed pages even if + * we are under the limit that has been set + * for a healthy filecache + */ + } break; } } @@ -2407,6 +2425,7 @@ reactivate_page: vm_page_deactivate(m); vm_pageout_inactive_deactivated++; } else { +must_activate_page: /* * The page was/is being used, so put back on active list. 
*/ @@ -2767,7 +2786,6 @@ vm_page_free_reserve( vm_page_free_target = vm_page_free_min + 5; vm_page_throttle_limit = vm_page_free_target - (vm_page_free_target / 3); - vm_page_creation_throttle = vm_page_free_target * 3; } /* @@ -3763,11 +3781,6 @@ void vm_pageout_reinit_tuneables(void); void vm_pageout_reinit_tuneables(void) { - vm_page_filecache_min = (uint32_t) (max_mem / PAGE_SIZE) / 15; - - if (vm_page_filecache_min < VM_PAGE_FILECACHE_MIN) - vm_page_filecache_min = VM_PAGE_FILECACHE_MIN; - vm_compressor_minorcompact_threshold_divisor = 18; vm_compressor_majorcompact_threshold_divisor = 22; vm_compressor_unthrottle_threshold_divisor = 32; @@ -3847,12 +3860,6 @@ vm_pageout(void) if (vm_pageout_burst_inactive_throttle == 0) vm_pageout_burst_inactive_throttle = VM_PAGEOUT_BURST_INACTIVE_THROTTLE; -#if !CONFIG_JETSAM - vm_page_filecache_min = (uint32_t) (max_mem / PAGE_SIZE) / 20; - if (vm_page_filecache_min < VM_PAGE_FILECACHE_MIN) - vm_page_filecache_min = VM_PAGE_FILECACHE_MIN; -#endif - /* * Set kernel task to low backing store privileged * status @@ -4314,11 +4321,10 @@ upl_set_decmp_info(upl_t upl, upl_t src_upl) } src_upl->decmp_io_upl = (void *)upl; src_upl->ref_count++; - upl_unlock(src_upl); upl->flags |= UPL_DECMP_REAL_IO; upl->decmp_io_upl = (void *)src_upl; - + upl_unlock(src_upl); } #endif /* CONFIG_IOSCHED */ diff --git a/osfmk/vm/vm_resident.c b/osfmk/vm/vm_resident.c index 513877c18..9d81f5070 100644 --- a/osfmk/vm/vm_resident.c +++ b/osfmk/vm/vm_resident.c @@ -360,7 +360,6 @@ ppnum_t max_valid_low_ppnum = 0xffffffff; unsigned int vm_page_free_target = 0; unsigned int vm_page_free_min = 0; unsigned int vm_page_throttle_limit = 0; -uint32_t vm_page_creation_throttle = 0; unsigned int vm_page_inactive_target = 0; unsigned int vm_page_anonymous_min = 0; unsigned int vm_page_inactive_min = 0; @@ -5122,7 +5121,6 @@ hibernate_flush_queue(queue_head_t *q, int qcount) goto reenter_pg_on_q; } - vm_pageout_scan_wants_object = m_object; vm_page_unlock_queues(); mutex_pause(try_failed_count++); @@ -5132,7 +5130,6 @@ hibernate_flush_queue(queue_head_t *q, int qcount) continue; } else { l_object = m_object; - vm_pageout_scan_wants_object = VM_OBJECT_NULL; } } if ( !m_object->alive || m->encrypted_cleaning || m->cleaning || m->laundry || m->busy || m->absent || m->error) { @@ -5198,7 +5195,6 @@ hibernate_flush_queue(queue_head_t *q, int qcount) vm_object_unlock(l_object); l_object = NULL; } - vm_pageout_scan_wants_object = VM_OBJECT_NULL; while (retval == 0) { @@ -5271,7 +5267,6 @@ next_pg: vm_object_unlock(l_object); l_object = NULL; } - vm_pageout_scan_wants_object = VM_OBJECT_NULL; vm_page_unlock_queues(); diff --git a/osfmk/x86_64/copyio.c b/osfmk/x86_64/copyio.c index 641141806..66a4dd7ac 100644 --- a/osfmk/x86_64/copyio.c +++ b/osfmk/x86_64/copyio.c @@ -74,6 +74,60 @@ extern int _bcopystr(const void *, void *, vm_size_t, vm_size_t *); #define COPYINPHYS 3 /* from user virtual to kernel physical */ #define COPYOUTPHYS 4 /* from kernel physical to user virtual */ +#if DEVELOPMENT +typedef struct { + uint64_t timestamp; + thread_t thread; + uintptr_t cr4; + uint8_t cpuid; + uint8_t smap_state; + uint8_t copyio_active; +} smaplog_entry_t; + +#define SMAPLOG_BUFFER_SIZE (50) +static smaplog_entry_t smaplog_cbuf[SMAPLOG_BUFFER_SIZE]; +static uint32_t smaplog_head = 0; + +static void +smaplog_add_entry(boolean_t enabling) +{ + uint32_t index = 0; + thread_t thread = current_thread(); + + do { + index = smaplog_head; + } while (!OSCompareAndSwap(index, (index + 1) % SMAPLOG_BUFFER_SIZE, 
&smaplog_head)); + + assert(index < SMAPLOG_BUFFER_SIZE); + assert(smaplog_head < SMAPLOG_BUFFER_SIZE); + assert(thread); + + smaplog_cbuf[index].timestamp = mach_absolute_time(); + smaplog_cbuf[index].thread = thread; + smaplog_cbuf[index].cpuid = cpu_number(); + smaplog_cbuf[index].cr4 = get_cr4(); + smaplog_cbuf[index].smap_state = enabling; + smaplog_cbuf[index].copyio_active = (thread->machine.specFlags & CopyIOActive) ? 1 : 0; +} +#endif /* DEVELOPMENT */ + +extern boolean_t pmap_smap_enabled; +static inline void user_access_enable(void) { + if (pmap_smap_enabled) { + stac(); +#if DEVELOPMENT + smaplog_add_entry(TRUE); +#endif + } +} +static inline void user_access_disable(void) { + if (pmap_smap_enabled) { + clac(); +#if DEVELOPMENT + smaplog_add_entry(FALSE); +#endif + } +} static int copyio(int copy_type, user_addr_t user_addr, char *kernel_addr, @@ -123,6 +177,7 @@ copyio(int copy_type, user_addr_t user_addr, char *kernel_addr, */ recursive_CopyIOActive = thread->machine.specFlags & CopyIOActive; thread->machine.specFlags |= CopyIOActive; + user_access_enable(); if (no_shared_cr3) { istate = ml_set_interrupts_enabled(FALSE); if (get_cr3_base() != pmap->pm_cr3) @@ -211,6 +266,7 @@ copyio(int copy_type, user_addr_t user_addr, char *kernel_addr, break; } + user_access_disable(); if (!recursive_CopyIOActive) { thread->machine.specFlags &= ~CopyIOActive; } diff --git a/osfmk/x86_64/pmap.c b/osfmk/x86_64/pmap.c index 8e1b55902..51b1b3348 100644 --- a/osfmk/x86_64/pmap.c +++ b/osfmk/x86_64/pmap.c @@ -320,6 +320,13 @@ pmap_cpu_init(void) pmap_smep_enabled = TRUE; } } + if (cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_SMAP) { + boolean_t nsmap; + if (!PE_parse_boot_argn("-pmap_smap_disable", &nsmap, sizeof(nsmap))) { + set_cr4(get_cr4() | CR4_SMAP); + pmap_smap_enabled = TRUE; + } + } if (cdp->cpu_fixed_pmcs_enabled) { boolean_t enable = TRUE; @@ -448,6 +455,8 @@ pmap_bootstrap( if (pmap_smep_enabled) printf("PMAP: Supervisor Mode Execute Protection enabled\n"); + if (pmap_smap_enabled) + printf("PMAP: Supervisor Mode Access Protection enabled\n"); #if DEBUG printf("Stack canary: 0x%lx\n", __stack_chk_guard[0]); diff --git a/pexpert/pexpert/i386/boot.h b/pexpert/pexpert/i386/boot.h index d79e4994a..ad6430635 100644 --- a/pexpert/pexpert/i386/boot.h +++ b/pexpert/pexpert/i386/boot.h @@ -175,7 +175,8 @@ typedef struct boot_args { uint32_t pciConfigSpaceEndBusNumber; uint32_t csrActiveConfig; uint32_t csrPendingConfig; - uint32_t __reserved4[728]; + uint32_t boot_SMC_plimit; + uint32_t __reserved4[727]; } boot_args; diff --git a/tools/tests/xnu_quick_test/tests.c b/tools/tests/xnu_quick_test/tests.c index ad892f2ae..8d1e8f460 100644 --- a/tools/tests/xnu_quick_test/tests.c +++ b/tools/tests/xnu_quick_test/tests.c @@ -900,7 +900,7 @@ int access_chmod_fchmod_test( void * the_argp ) char * my_pathp = NULL; - uid_t euid,ruid; + uid_t ruid; struct stat my_sb; FILE * file_handle; @@ -987,10 +987,13 @@ int access_chmod_fchmod_test( void * the_argp ) file_handle = fopen(FILE_NOTME, "w"); fclose(file_handle); - /* Currently running as root (through setreuid manipulation), switch to running as the current user. */ - euid = geteuid(); + /* Currently running as root (through settid manipulation), switch to running as the current user. 
*/ ruid = getuid(); - setreuid(ruid, ruid); + my_err = syscall(SYS_settid, ruid, KAUTH_GID_NONE); + if (my_err != 0) { + printf("Failed to settid to non-root with error %d:%s\n", errno, strerror(errno)); + goto test_failed_exit; + } /* Create a file that the current user owns */ file_handle = fopen(FILE_ME, "w"); @@ -1033,8 +1036,11 @@ int access_chmod_fchmod_test( void * the_argp ) } /* Reset to running as root */ - setreuid(ruid, euid); - + my_err = syscall(SYS_settid, KAUTH_UID_NONE, KAUTH_GID_NONE); + if (my_err != 0) { + printf("Failed to revert to root using settid with error %d:%s\n", errno, strerror(errno)); + goto test_failed_exit; + } if(error_occurred == 1) { goto test_failed_exit; } @@ -5908,7 +5914,7 @@ int faccessat_fchmodat_fchmod_test( void * the_argp ) char * my_namep = NULL; char * my_pathp = NULL; - uid_t euid,ruid; + uid_t ruid; struct stat my_sb; FILE * file_handle; @@ -6044,10 +6050,13 @@ int faccessat_fchmodat_fchmod_test( void * the_argp ) file_handle = fopen(FILE_NOTME, "w"); fclose(file_handle); - /* Currently running as root (through setreuid manipulation), switch to running as the current user. */ - euid = geteuid(); + /* Currently running as root (through settid manipulation), switch to running as the current user. */ ruid = getuid(); - setreuid(ruid, ruid); + my_err = syscall(SYS_settid, ruid, KAUTH_GID_NONE); + if (my_err != 0) { + printf("Failed to settid to non-root with error %d:%s\n", errno, strerror(errno)); + goto test_failed_exit; + } /* Create a file that the current user owns */ file_handle = fopen(FILE_ME, "w"); @@ -6090,7 +6099,11 @@ int faccessat_fchmodat_fchmod_test( void * the_argp ) } /* Reset to running as root */ - setreuid(ruid, euid); + my_err = syscall(SYS_settid, KAUTH_UID_NONE, KAUTH_GID_NONE); + if (my_err != 0) { + printf("Failed to settid revert to root with error %d:%s\n", errno, strerror(errno)); + goto test_failed_exit; + } if(error_occurred == 1) { goto test_failed_exit;
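/*
 * Illustrative sketch (not part of this change): the stac()/clac() inlines,
 * the CR4_SMAP enablement in pmap_cpu_init() and the user_access_enable()/
 * user_access_disable() calls in copyio() above combine into the usual SMAP
 * discipline -- user pages are only touchable while RFLAGS.AC is raised
 * around the copy, so the new kernel_trap()/panic_trap() checks can treat a
 * supervisor protection fault on a user address with AC clear as a potential
 * SMAP violation.  The helper below compresses that flow; memcpy(), the int
 * flag and smap_copyin() itself are stand-ins, not xnu API.
 */
#include <stddef.h>
#include <string.h>

static int pmap_smap_enabled = 1;     /* set when CPUID leaf 7 reports SMAP */

static inline void stac(void) { __asm__ volatile("stac"); }  /* RFLAGS.AC = 1 */
static inline void clac(void) { __asm__ volatile("clac"); }  /* RFLAGS.AC = 0 */

static inline void user_access_enable(void)  { if (pmap_smap_enabled) stac(); }
static inline void user_access_disable(void) { if (pmap_smap_enabled) clac(); }

/* copyin-style helper, showing only the SMAP bracketing */
static int
smap_copyin(const void *user_src, void *kernel_dst, size_t nbytes)
{
	user_access_enable();                  /* open the window: user pages readable */
	memcpy(kernel_dst, user_src, nbytes);  /* stand-in for the real copy loop      */
	user_access_disable();                 /* re-arm SMAP before returning         */
	return 0;
}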