git.saurik.com Git - apple/xnu.git/commitdiff
xnu-2782.20.48 (tags: os-x-10103, v2782.20.48)
author    Apple <opensource@apple.com>   Fri, 25 Sep 2015 15:59:39 +0000 (15:59 +0000)
committer Apple <opensource@apple.com>   Fri, 25 Sep 2015 15:59:39 +0000 (15:59 +0000)
83 files changed:
bsd/conf/files
bsd/dev/dtrace/dtrace.c
bsd/hfs/hfs.h
bsd/hfs/hfs_cnode.c
bsd/hfs/hfs_fsctl.h
bsd/hfs/hfs_fsinfo.c [new file with mode: 0644]
bsd/hfs/hfs_readwrite.c
bsd/hfs/hfs_vnops.c
bsd/hfs/hfscommon/Catalog/FileIDsServices.c
bsd/hfs/hfscommon/Misc/VolumeAllocation.c
bsd/hfs/hfscommon/headers/FileMgrInternal.h
bsd/kern/bsd_init.c
bsd/kern/kdebug.c
bsd/kern/kern_control.c
bsd/kern/kern_event.c
bsd/kern/kern_exec.c
bsd/kern/kern_exit.c
bsd/kern/kern_prot.c
bsd/kern/kern_sysctl.c
bsd/kern/mach_loader.c
bsd/kern/mach_loader.h
bsd/kern/makekdebugevents.py [new file with mode: 0755]
bsd/kern/proc_info.c
bsd/kern/sys_generic.c
bsd/kern/trace.codes
bsd/kern/uipc_socket.c
bsd/man/man2/kqueue.2
bsd/miscfs/specfs/spec_vnops.c
bsd/net/if_bridge.c
bsd/netinet/in_systm.h
bsd/netinet/ip_icmp.c
bsd/netinet/ip_input.c
bsd/netinet/tcp_cubic.c
bsd/netinet/tcp_debug.h
bsd/netinet6/in6_proto.c
bsd/netinet6/ip6_input.c
bsd/nfs/nfs_bio.c
bsd/nfs/nfs_vfsops.c
bsd/sys/Makefile
bsd/sys/dtrace.h
bsd/sys/dtrace_impl.h
bsd/sys/event.h
bsd/sys/kdebug.h
bsd/vfs/vfs_fsevents.c
bsd/vfs/vfs_lookup.c
bsd/vfs/vfs_syscalls.c
config/MasterVersion
config/Private.exports
libsyscall/mach/.gitignore [new file with mode: 0644]
osfmk/atm/atm.c
osfmk/device/device_init.c
osfmk/i386/AT386/model_dep.c
osfmk/i386/acpi.c
osfmk/i386/cpuid.c
osfmk/i386/cpuid.h
osfmk/i386/panic_hooks.c
osfmk/i386/panic_hooks.h
osfmk/i386/proc_reg.h
osfmk/i386/trap.c
osfmk/i386/trap.h
osfmk/ipc/mach_debug.c
osfmk/kern/bsd_kern.c
osfmk/kern/debug.c
osfmk/kern/debug.h
osfmk/kern/hv_support.c
osfmk/kern/hv_support.h
osfmk/kern/sfi.c
osfmk/kern/sfi.h
osfmk/kern/startup.c
osfmk/kern/thread.c
osfmk/kern/thread.h
osfmk/mach/machine.h
osfmk/vm/vm_compressor.c
osfmk/vm/vm_fault.c
osfmk/vm/vm_map.c
osfmk/vm/vm_map_store.c
osfmk/vm/vm_map_store.h
osfmk/vm/vm_pageout.c
osfmk/vm/vm_resident.c
osfmk/x86_64/copyio.c
osfmk/x86_64/pmap.c
pexpert/pexpert/i386/boot.h
tools/tests/xnu_quick_test/tests.c

diff --git a/bsd/conf/files b/bsd/conf/files
index 54b4ef14d2cf5e85e963d0c761e0df4c643b0efb..d4ce218f8b52406760d772b9143d64f203dbd94d 100644 (file)
@@ -381,6 +381,7 @@ bsd/hfs/hfs_cnode.c                         optional hfs
 bsd/hfs/hfs_encodinghint.c                     standard
 bsd/hfs/hfs_encodings.c                                standard
 bsd/hfs/hfs_endian.c                           optional hfs
+bsd/hfs/hfs_fsinfo.c                           optional hfs
 bsd/hfs/hfs_hotfiles.c                         optional hfs
 bsd/hfs/hfs_link.c                             optional hfs
 bsd/hfs/hfs_lookup.c                           optional hfs
diff --git a/bsd/dev/dtrace/dtrace.c b/bsd/dev/dtrace/dtrace.c
index 25f6d7c8e4d5156f6da257955b7a8bc80540eae0..dd02ad5026d4b620eec8e57ce1ad157ce37152ca 100644 (file)
@@ -20,7 +20,8 @@
  */
 
 /*
- * Portions copyright (c) 2011, Joyent, Inc. All rights reserved.
+ * Portions Copyright (c) 2011, Joyent, Inc. All rights reserved.
+ * Portions Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 /*
@@ -2583,9 +2584,10 @@ dtrace_speculation_commit(dtrace_state_t *state, processorid_t cpu,
 {
        dtrace_speculation_t *spec;
        dtrace_buffer_t *src, *dest;
-       uintptr_t daddr, saddr, dlimit;
+       uintptr_t daddr, saddr, dlimit, slimit;
        dtrace_speculation_state_t current,  new = DTRACESPEC_INACTIVE;
        intptr_t offs;
+       uint64_t timestamp;
 
        if (which == 0)
                return;
@@ -2661,7 +2663,38 @@ dtrace_speculation_commit(dtrace_state_t *state, processorid_t cpu,
        }
 
        /*
-        * We have the space; copy the buffer across.  (Note that this is a
+        * We have sufficient space to copy the speculative buffer into the
+        * primary buffer.  First, modify the speculative buffer, filling
+        * in the timestamp of all entries with the current time.  The data
+        * must have the commit() time rather than the time it was traced,
+        * so that all entries in the primary buffer are in timestamp order.
+        */
+       timestamp = dtrace_gethrtime();
+       saddr = (uintptr_t)src->dtb_tomax;
+       slimit = saddr + src->dtb_offset;
+       while (saddr < slimit) {
+               size_t size;
+               dtrace_rechdr_t *dtrh = (dtrace_rechdr_t *)saddr;
+
+               if (dtrh->dtrh_epid == DTRACE_EPIDNONE) {
+                       saddr += sizeof (dtrace_epid_t);
+                       continue;
+               }
+
+               ASSERT(dtrh->dtrh_epid <= ((dtrace_epid_t) state->dts_necbs));
+               size = state->dts_ecbs[dtrh->dtrh_epid - 1]->dte_size;
+
+               ASSERT(saddr + size <= slimit);
+               ASSERT(size >= sizeof(dtrace_rechdr_t));
+               ASSERT(DTRACE_RECORD_LOAD_TIMESTAMP(dtrh) == UINT64_MAX);
+
+               DTRACE_RECORD_STORE_TIMESTAMP(dtrh, timestamp);
+
+               saddr += size;
+       }
+
+       /*
+        * Copy the buffer across.  (Note that this is a
         * highly subobtimal bcopy(); in the unlikely event that this becomes
         * a serious performance issue, a high-performance DTrace-specific
         * bcopy() should obviously be invented.)
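
The dtrace_rechdr_t type and the DTRACE_RECORD_STORE_TIMESTAMP / DTRACE_RECORD_LOAD_TIMESTAMP macros used in this hunk come from the bsd/sys/dtrace.h side of the commit, which is not reproduced in this excerpt. In the upstream illumos/Delphix change they look roughly like the sketch below; the xnu copy may differ in detail.

typedef struct dtrace_rechdr {
        dtrace_epid_t dtrh_epid;           /* enabled probe ID */
        uint32_t      dtrh_timestamp_hi;   /* high bits of hrtime_t */
        uint32_t      dtrh_timestamp_lo;   /* low bits of hrtime_t */
} dtrace_rechdr_t;

#define DTRACE_RECORD_LOAD_TIMESTAMP(dtrh)              \
        ((dtrh)->dtrh_timestamp_lo +                    \
        ((uint64_t)(dtrh)->dtrh_timestamp_hi << 32))

#define DTRACE_RECORD_STORE_TIMESTAMP(dtrh, hrtime) {   \
        (dtrh)->dtrh_timestamp_lo = (uint32_t)hrtime;   \
        (dtrh)->dtrh_timestamp_hi = hrtime >> 32;       \
}

Splitting the 64-bit timestamp into two 32-bit halves keeps every field of the record header 4 bytes wide, so the header still only needs dtrace_epid_t alignment even though each record now carries a timestamp that is rewritten to the commit time when a speculation is committed.
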
@@ -6119,8 +6152,23 @@ __dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1,
                tomax = buf->dtb_tomax;
                ASSERT(tomax != NULL);
 
-               if (ecb->dte_size != 0)
-                       DTRACE_STORE(uint32_t, tomax, offs, ecb->dte_epid);
+               /*
+                * Build and store the record header corresponding to the ECB.
+                */
+               if (ecb->dte_size != 0) {
+                       dtrace_rechdr_t dtrh;
+
+                       if (!(mstate.dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
+                               mstate.dtms_timestamp = dtrace_gethrtime();
+                               mstate.dtms_present |= DTRACE_MSTATE_TIMESTAMP;
+                       }
+
+                       ASSERT(ecb->dte_size >= sizeof(dtrace_rechdr_t));
+
+                       dtrh.dtrh_epid = ecb->dte_epid;
+                       DTRACE_RECORD_STORE_TIMESTAMP(&dtrh, mstate.dtms_timestamp);
+                       DTRACE_STORE(dtrace_rechdr_t, tomax, offs, dtrh);
+               }
 
                mstate.dtms_epid = ecb->dte_epid;
                mstate.dtms_present |= DTRACE_MSTATE_EPID;
@@ -6268,7 +6316,9 @@ __dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1,
                                continue;
 
                        switch (act->dta_kind) {
-                       case DTRACEACT_SPECULATE:
+                       case DTRACEACT_SPECULATE: {
+                               dtrace_rechdr_t *dtrh = NULL;
+
                                ASSERT(buf == &state->dts_buffer[cpuid]);
                                buf = dtrace_speculation_buffer(state,
                                    cpuid, val);
@@ -6291,9 +6341,23 @@ __dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1,
                                ASSERT(tomax != NULL);
 
-                               if (ecb->dte_size != 0)
-                                       DTRACE_STORE(uint32_t, tomax, offs,
-                                           ecb->dte_epid);
-                               continue;
+                               if (ecb->dte_size == 0)
+                                       continue;
+
+                               ASSERT(ecb->dte_size >= sizeof(dtrace_rechdr_t));
+                               dtrh = ((void *)(tomax + offs));
+                               dtrh->dtrh_epid = ecb->dte_epid;
+
+                               /*
+                                * When the speculation is committed, all of
+                                * the records in the speculative buffer will
+                                * have their timestamps set to the commit
+                                * time.  Until then, it is set to a sentinel
+                                * value, for debuggability.
+                                */
+                               DTRACE_RECORD_STORE_TIMESTAMP(dtrh, UINT64_MAX);
+
+                               continue;
+                       }
 
                        case DTRACEACT_CHILL:
                                if (dtrace_priv_kernel_destructive(state))
@@ -9559,9 +9623,9 @@ dtrace_ecb_add(dtrace_state_t *state, dtrace_probe_t *probe)
 
        /*
         * The default size is the size of the default action: recording
-        * the epid.
+        * the header.
         */
-       ecb->dte_size = ecb->dte_needed = sizeof (dtrace_epid_t);
+       ecb->dte_size = ecb->dte_needed = sizeof (dtrace_rechdr_t);
        ecb->dte_alignment = sizeof (dtrace_epid_t);
 
        epid = state->dts_epid++;
@@ -9661,122 +9725,85 @@ dtrace_ecb_enable(dtrace_ecb_t *ecb)
 static void
 dtrace_ecb_resize(dtrace_ecb_t *ecb)
 {
-       uint32_t maxalign = sizeof (dtrace_epid_t);
-       uint32_t align = sizeof (uint8_t), offs, diff;
        dtrace_action_t *act;
-       int wastuple = 0;
+       uint32_t curneeded = UINT32_MAX;
        uint32_t aggbase = UINT32_MAX;
-       dtrace_state_t *state = ecb->dte_state;
 
        /*
-        * If we record anything, we always record the epid.  (And we always
-        * record it first.)
+        * If we record anything, we always record the dtrace_rechdr_t.  (And
+        * we always record it first.)
         */
-       offs = sizeof (dtrace_epid_t);
-       ecb->dte_size = ecb->dte_needed = sizeof (dtrace_epid_t);
+       ecb->dte_size = sizeof (dtrace_rechdr_t);
+       ecb->dte_alignment = sizeof (dtrace_epid_t);
 
        for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
                dtrace_recdesc_t *rec = &act->dta_rec;
+               ASSERT(rec->dtrd_size > 0 || rec->dtrd_alignment == 1);
 
-               if ((align = rec->dtrd_alignment) > maxalign)
-                       maxalign = align;
-
-               if (!wastuple && act->dta_intuple) {
-                       /*
-                        * This is the first record in a tuple.  Align the
-                        * offset to be at offset 4 in an 8-byte aligned
-                        * block.
-                        */
-                       diff = offs + sizeof (dtrace_aggid_t);
-
-                       if ((diff = (diff & (sizeof (uint64_t) - 1))))
-                               offs += sizeof (uint64_t) - diff;
-
-                       aggbase = offs - sizeof (dtrace_aggid_t);
-                       ASSERT(!(aggbase & (sizeof (uint64_t) - 1)));
-               }
-
-               /*LINTED*/
-               if (rec->dtrd_size != 0 && (diff = (offs & (align - 1)))) {
-                       /*
-                        * The current offset is not properly aligned; align it.
-                        */
-                       offs += align - diff;
-               }
-
-               rec->dtrd_offset = offs;
-
-               if (offs + rec->dtrd_size > ecb->dte_needed) {
-                       ecb->dte_needed = offs + rec->dtrd_size;
-
-                       if (ecb->dte_needed > state->dts_needed)
-                               state->dts_needed = ecb->dte_needed;
-               }
+               ecb->dte_alignment = MAX(ecb->dte_alignment, rec->dtrd_alignment);
 
                if (DTRACEACT_ISAGG(act->dta_kind)) {
                        dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
-                       dtrace_action_t *first = agg->dtag_first, *prev;
 
-                       ASSERT(rec->dtrd_size != 0 && first != NULL);
-                       ASSERT(wastuple);
+                       ASSERT(rec->dtrd_size != 0);
+                       ASSERT(agg->dtag_first != NULL);
+                       ASSERT(act->dta_prev->dta_intuple);
                        ASSERT(aggbase != UINT32_MAX);
+                       ASSERT(curneeded != UINT32_MAX);
 
                        agg->dtag_base = aggbase;
 
-                       while ((prev = first->dta_prev) != NULL &&
-                           DTRACEACT_ISAGG(prev->dta_kind)) {
-                               agg = (dtrace_aggregation_t *)prev;
-                               first = agg->dtag_first;
-                       }
+                       curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);
+                       rec->dtrd_offset = curneeded;
+                       curneeded += rec->dtrd_size;
+                       ecb->dte_needed = MAX(ecb->dte_needed, curneeded);
 
-                       if (prev != NULL) {
-                               offs = prev->dta_rec.dtrd_offset +
-                                   prev->dta_rec.dtrd_size;
-                       } else {
-                               offs = sizeof (dtrace_epid_t);
+                       aggbase = UINT32_MAX;
+                       curneeded = UINT32_MAX;
+               } else if (act->dta_intuple) {
+                       if (curneeded == UINT32_MAX) {
+                               /*
+                                * This is the first record in a tuple.  Align
+                                * curneeded to be at offset 4 in an 8-byte
+                                * aligned block.
+                                */
+                               ASSERT(act->dta_prev == NULL || !act->dta_prev->dta_intuple);
+                               ASSERT(aggbase == UINT32_MAX);
+
+                               curneeded = P2PHASEUP(ecb->dte_size,
+                                   sizeof (uint64_t), sizeof (dtrace_aggid_t));
+
+                               aggbase = curneeded - sizeof (dtrace_aggid_t);
+                               ASSERT(IS_P2ALIGNED(aggbase,
+                                   sizeof (uint64_t)));
                        }
-                       wastuple = 0;
-               } else {
-                       if (!act->dta_intuple)
-                               ecb->dte_size = offs + rec->dtrd_size;
 
-                       offs += rec->dtrd_size;
+                       curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);
+                       rec->dtrd_offset = curneeded;
+                       curneeded += rec->dtrd_size;
+               } else {
+                       /* tuples must be followed by an aggregation */
+                       ASSERT(act->dta_prev == NULL || !act->dta_prev->dta_intuple);
+                       ecb->dte_size = P2ROUNDUP(ecb->dte_size, rec->dtrd_alignment);
+                       rec->dtrd_offset = ecb->dte_size;
+                       ecb->dte_size += rec->dtrd_size;
+                       ecb->dte_needed = MAX(ecb->dte_needed, ecb->dte_size);
                }
-
-               wastuple = act->dta_intuple;
        }
 
        if ((act = ecb->dte_action) != NULL &&
            !(act->dta_kind == DTRACEACT_SPECULATE && act->dta_next == NULL) &&
-           ecb->dte_size == sizeof (dtrace_epid_t)) {
+           ecb->dte_size == sizeof (dtrace_rechdr_t)) {
                /*
-                * If the size is still sizeof (dtrace_epid_t), then all
+                * If the size is still sizeof (dtrace_rechdr_t), then all
                 * actions store no data; set the size to 0.
                 */
-               ecb->dte_alignment = maxalign;
                ecb->dte_size = 0;
-
-               /*
-                * If the needed space is still sizeof (dtrace_epid_t), then
-                * all actions need no additional space; set the needed
-                * size to 0.
-                */
-               if (ecb->dte_needed == sizeof (dtrace_epid_t))
-                       ecb->dte_needed = 0;
-
-               return;
        }
 
-       /*
-        * Set our alignment, and make sure that the dte_size and dte_needed
-        * are aligned to the size of an EPID.
-        */
-       ecb->dte_alignment = maxalign;
-       ecb->dte_size = (ecb->dte_size + (sizeof (dtrace_epid_t) - 1)) &
-           ~(sizeof (dtrace_epid_t) - 1);
-       ecb->dte_needed = (ecb->dte_needed + (sizeof (dtrace_epid_t) - 1)) &
-           ~(sizeof (dtrace_epid_t) - 1);
-       ASSERT(ecb->dte_size <= ecb->dte_needed);
+       ecb->dte_size = P2ROUNDUP(ecb->dte_size, sizeof (dtrace_epid_t));
+       ecb->dte_needed = P2ROUNDUP(ecb->dte_needed, (sizeof (dtrace_epid_t)));
+       ecb->dte_state->dts_needed = MAX(ecb->dte_state->dts_needed, ecb->dte_needed);
 }
 
 static dtrace_action_t *
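
The rewritten dtrace_ecb_resize() above relies on the Solaris-derived power-of-two helpers. Their conventional definitions are sketched below for reference (the actual definitions live in the DTrace headers, not in this diff; align must be a power of two in all of them).

/* Round x up to the next multiple of align. */
#define P2ROUNDUP(x, align)         (-(-(x) & -(align)))

/* Smallest value >= x that is congruent to phase modulo align. */
#define P2PHASEUP(x, align, phase)  ((phase) - (((phase) - (x)) & -(align)))

/* True if v is aligned to a. */
#define IS_P2ALIGNED(v, a)          ((((uintptr_t)(v)) & ((uintptr_t)(a) - 1)) == 0)

With those semantics, P2PHASEUP(ecb->dte_size, sizeof (uint64_t), sizeof (dtrace_aggid_t)) returns the first offset at or past dte_size that sits 4 bytes into an 8-byte-aligned block, which is exactly what the tuple-alignment comment in the hunk calls for.
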
@@ -10147,7 +10174,7 @@ dtrace_ecb_action_add(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
                        break;
 
                case DTRACEACT_SPECULATE:
-                       if (ecb->dte_size > sizeof (dtrace_epid_t))
+                       if (ecb->dte_size > sizeof (dtrace_rechdr_t))
                                return (EINVAL);
 
                        if (dp == NULL)
@@ -10260,7 +10287,7 @@ dtrace_ecb_action_remove(dtrace_ecb_t *ecb)
 
        ecb->dte_action = NULL;
        ecb->dte_action_last = NULL;
-       ecb->dte_size = sizeof (dtrace_epid_t);
+       ecb->dte_size = 0;
 }
 
 static void
@@ -10534,11 +10561,13 @@ dtrace_buffer_switch(dtrace_buffer_t *buf)
        caddr_t tomax = buf->dtb_tomax;
        caddr_t xamot = buf->dtb_xamot;
        dtrace_icookie_t cookie;
+       hrtime_t now;
 
        ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
        ASSERT(!(buf->dtb_flags & DTRACEBUF_RING));
 
        cookie = dtrace_interrupt_disable();
+       now = dtrace_gethrtime();
        buf->dtb_tomax = xamot;
        buf->dtb_xamot = tomax;
        buf->dtb_xamot_drops = buf->dtb_drops;
@@ -10549,6 +10578,8 @@ dtrace_buffer_switch(dtrace_buffer_t *buf)
        buf->dtb_drops = 0;
        buf->dtb_errors = 0;
        buf->dtb_flags &= ~(DTRACEBUF_ERROR | DTRACEBUF_DROPPED);
+       buf->dtb_interval = now - buf->dtb_switched;
+       buf->dtb_switched = now;
        dtrace_interrupt_enable(cookie);
 }
 
@@ -16617,6 +16648,7 @@ dtrace_ioctl(dev_t dev, u_long cmd, user_addr_t arg, int md, cred_t *cr, int *rv
                        desc.dtbd_drops = buf->dtb_drops;
                        desc.dtbd_errors = buf->dtb_errors;
                        desc.dtbd_oldest = buf->dtb_xamot_offset;
+                       desc.dtbd_timestamp = dtrace_gethrtime();
 
                        lck_mtx_unlock(&dtrace_lock);
 
@@ -16669,6 +16701,7 @@ dtrace_ioctl(dev_t dev, u_long cmd, user_addr_t arg, int md, cred_t *cr, int *rv
                desc.dtbd_drops = buf->dtb_xamot_drops;
                desc.dtbd_errors = buf->dtb_xamot_errors;
                desc.dtbd_oldest = 0;
+               desc.dtbd_timestamp = buf->dtb_switched;
 
                lck_mtx_unlock(&dtrace_lock);
 
diff --git a/bsd/hfs/hfs.h b/bsd/hfs/hfs.h
index 0f3771a2204fa3a687b0410eb309bc5eda6acab3..e3898a3a1a2ea824b63dfd9b2e72a60b3cfa5d32 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -106,6 +106,9 @@ extern struct timezone gTimeZone;
 /* How many free extents to cache per volume */
 #define kMaxFreeExtents                10
 
+/* The maximum time hfs locks can be held while performing hfs statistics gathering */
+#define HFS_FSINFO_MAX_LOCKHELD_TIME   20 * 1000000ULL /* at most 20 milliseconds. */
+
 /*
  * HFS_MINFREE gives the minimum acceptable percentage
  * of file system blocks which may be free (but this
@@ -715,20 +718,6 @@ extern int hfs_prepare_release_storage (struct hfsmount *hfsmp, struct vnode *vp
 
 extern int hfs_bmap(struct vnode *, daddr_t, struct vnode **, daddr64_t *, unsigned int *);
 
-extern int hfs_fsync(struct vnode *, int, int, struct proc *);
-
-extern int hfs_access(struct vnode *, mode_t, kauth_cred_t, struct proc *);
-
-extern int hfs_removeallattr(struct hfsmount *hfsmp, u_int32_t fileid);
-
-extern int hfs_set_volxattr(struct hfsmount *hfsmp, unsigned int xattrtype, int state);
-
-extern int hfs_isallocated(struct hfsmount *hfsmp, u_int32_t startingBlock, u_int32_t numBlocks);
-
-extern int hfs_count_allocated(struct hfsmount *hfsmp, u_int32_t startBlock, 
-               u_int32_t numBlocks, u_int32_t *alloc_count);
-
-extern int hfs_isrbtree_active (struct hfsmount *hfsmp);
 extern errno_t hfs_ubc_setsize(vnode_t vp, off_t len, bool have_cnode_lock);
 
 
@@ -904,6 +893,7 @@ extern int hfs_vgetrsrc(struct hfsmount *hfsmp, struct vnode *vp,
 
 extern int hfs_update(struct vnode *, int);
 
+extern int hfs_fsync(struct vnode *, int, int, struct proc *);
 
 /*****************************************************************************
        Functions from hfs_xattr.c
@@ -929,6 +919,9 @@ int hfs_getxattr_internal(cnode_t *, struct vnop_getxattr_args *,
 int hfs_xattr_write(vnode_t vp, const char *name, const void *data, size_t size);
 int hfs_setxattr_internal(struct cnode *, const void *, size_t, 
                           struct vnop_setxattr_args *, struct hfsmount *, u_int32_t);
+extern int hfs_removeallattr(struct hfsmount *hfsmp, u_int32_t fileid);
+extern int hfs_set_volxattr(struct hfsmount *hfsmp, unsigned int xattrtype, int state);
+
 
 
 /*****************************************************************************
@@ -951,6 +944,23 @@ extern cnid_t  hfs_currentparent(cnode_t *cp);
 extern cnid_t  hfs_currentcnid(cnode_t *cp);
 
 
+/*****************************************************************************
+       Functions from VolumeAllocation.c
+ ******************************************************************************/
+extern int hfs_isallocated(struct hfsmount *hfsmp, u_int32_t startingBlock,
+                                                  u_int32_t numBlocks);
+
+extern int hfs_count_allocated(struct hfsmount *hfsmp, u_int32_t startBlock,
+                                                          u_int32_t numBlocks, u_int32_t *alloc_count);
+
+extern int hfs_isrbtree_active (struct hfsmount *hfsmp);
+
+/*****************************************************************************
+       Functions from hfs_fsinfo.c
+ ******************************************************************************/
+extern errno_t hfs_get_fsinfo(struct hfsmount *hfsmp, void *a_data);
+extern void hfs_fsinfo_data_add(struct hfs_fsinfo_data *fsinfo, uint64_t entry);
+
 #endif /* __APPLE_API_PRIVATE */
 #endif /* KERNEL */
 #endif /* __HFS__ */
diff --git a/bsd/hfs/hfs_cnode.c b/bsd/hfs/hfs_cnode.c
index c2f92f02a4bfe97399ee9b43b196ea90195ceba5..89589de2855cb5e5105e57a2068131ba7f9f4c2f 100644 (file)
@@ -2448,18 +2448,20 @@ hfs_unlock_truncate(struct cnode *cp, enum hfs_lockflags flags)
                vnode_t vp = NULL, rvp = NULL;
 
                /*
-                * Deal with any pending set sizes.  We need to call
-                * ubc_setsize before we drop the exclusive lock.  Ideally,
-                * hfs_unlock should be called before hfs_unlock_truncate but
-                * that's a lot to ask people to remember :-)
+                * If there are pending set sizes, the cnode lock should be dropped
+                * first.
                 */
+#if DEBUG
+               assert(!(cp->c_lockowner == thread
+                                && ISSET(cp->c_flag, C_NEED_DATA_SETSIZE | C_NEED_RSRC_SETSIZE)));
+#elif DEVELOPMENT
                if (cp->c_lockowner == thread
                        && ISSET(cp->c_flag, C_NEED_DATA_SETSIZE | C_NEED_RSRC_SETSIZE)) {
-                       // hfs_unlock will do the setsize calls for us
-                       hfs_unlock(cp);
-                       hfs_lock_always(cp, HFS_EXCLUSIVE_LOCK);
+                       printf("hfs: hfs_unlock_truncate called with C_NEED_DATA/RSRC_SETSIZE set (caller: 0x%llx)\n",
+                                  (uint64_t)VM_KERNEL_UNSLIDE(__builtin_return_address(0)));
                }
+#endif
+
                if (cp->c_need_dvnode_put_after_truncate_unlock) {
                        vp = cp->c_vp;
                        cp->c_need_dvnode_put_after_truncate_unlock = false;
diff --git a/bsd/hfs/hfs_fsctl.h b/bsd/hfs/hfs_fsctl.h
index f7f3c26b19d3e50bbef7cf4d840e31d4bc799c96..b90b722b5ca74340fdc680b1ca1e07bde44b5e1f 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2014 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2004-2015 Apple Computer, Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -54,6 +54,7 @@ struct hfs_journal_info {
 };
 
 
+// Will be deprecated and replaced by hfs_fsinfo
 struct hfsinfo_metadata {
        uint32_t total;
        uint32_t extents;
@@ -64,6 +65,189 @@ struct hfsinfo_metadata {
        uint32_t reserved[4];
 };
 
+/*
+ * Flags for hfs_fsinfo_data structure
+ */
+#define HFS_FSINFO_CLASS_A      0x0001 /* Information for class A files requested */
+#define HFS_FSINFO_CLASS_B      0x0002 /* Information for class B files requested */
+#define HFS_FSINFO_CLASS_C      0x0004 /* Information for class C files requested */
+#define HFS_FSINFO_CLASS_D      0x0008 /* Information for class D files requested */
+
+/*
+ * Maximum number of buckets to represent range from 0 to 1TB (2^40) in
+ * increments of power of 2, and one catch-all bucket for anything that
+ * is greater than 1TB
+ */
+#define HFS_FSINFO_DATA_MAX_BUCKETS     42
+
+/*
+ * Maximum number of buckets to represent the percentage range from 0 to 100
+ * in increments of 10.
+ */
+#define HFS_FSINFO_PERCENT_MAX_BUCKETS  10
+
+/*
+ * Maximum number of buckets to represent number of file/directory name characters
+ * (range 1 to 255) in increments of 5.
+ */
+#define HFS_FSINFO_NAME_MAX_BUCKETS     51
+
+/*
+ * Version number to ensure that the caller and the kernel have same understanding
+ * of the hfs_fsinfo_data structure.  This version needs to be bumped whenever the
+ * number of buckets is changed.
+ */
+#define HFS_FSINFO_VERSION              1
+
+/*
+ * hfs_fsinfo_data is generic data structure to aggregate information like sizes
+ * or counts in buckets of power of 2.  Each bucket represents a range of values
+ * that is determined based on its index in the array.  Specifically, buckets[i]
+ * represents values that are greater than or equal to 2^(i-1) and less than 2^i,
+ * except the last bucket which represents range greater than or equal to 2^(i-1)
+ *
+ * The current maximum number of buckets is 42 (indices 0 to 41), so we can represent range from
+ * 0 up to 1TB in increments of power of 2, and then a catch-all bucket of
+ * anything that is greater than or equal to 1TB.
+ *
+ * For example,
+ * bucket[0]  -> greater than or equal to 0 and less than 1
+ * bucket[1]  -> greater than or equal to 1 and less than 2
+ * bucket[10] -> greater than or equal to 2^(10-1) = 512 and less than 2^10 = 1024
+ * bucket[20] -> greater than or equal to 2^(20-1) = 512KB and less than 2^20 = 1MB
+ * bucket[41] -> greater than or equal to 2^(41-1) = 1TB
+ *
+ * Note that fsctls that populate this data structure can take long time to
+ * execute as this operation can be I/O intensive (traversing btrees) and compute
+ * intensive.
+ *
+ * WARNING: Any changes to this structure should also update version number to
+ * ensure that the clients and kernel are reading/writing correctly.
+ */
+
+/* 
+ * The header includes the user input fields.
+ */
+typedef struct hfs_fsinfo_header {
+       uint32_t request_type;
+       uint16_t version;
+       uint16_t flags;
+} hfs_fsinfo_header_t;
+
+struct hfs_fsinfo_data {
+       hfs_fsinfo_header_t header;
+       uint32_t                        bucket[HFS_FSINFO_DATA_MAX_BUCKETS];
+};
+
+/*
+ * Structure to represent information about metadata files
+ *
+ * WARNING: Any changes to this structure should also update version number to
+ * ensure that the clients and kernel are reading/writing correctly.
+ */
+struct hfs_fsinfo_metadata {
+       hfs_fsinfo_header_t header;
+       uint32_t                        extents;
+       uint32_t                        catalog;
+       uint32_t                        allocation;
+       uint32_t                        attribute;
+       uint32_t                        journal;
+};
+
+/*
+ * Structure to represent distribution of number of file name characters
+ * in increments of 5s.  Each bucket represents a range of values that is
+ * determined based on its index in the array.  So bucket[i] represents values
+ * that are greater than or equal to (i*5) and less than ((i+1)*5).
+ *
+ * Since this structure represents range of file name characters and the
+ * maximum number of unicode characters in HFS+ is 255, the maximum number
+ * of buckets will be 52 [0..51].
+ *
+ * For example,
+ * bucket[4] -> greater than or equal to 20 and less than 25 characters
+ * bucket[51] -> equal to 255 characters
+ *
+ * WARNING: Any changes to this structure should also update version number to
+ * ensure that the clients and kernel are reading/writing correctly.
+ */
+struct hfs_fsinfo_name {
+       hfs_fsinfo_header_t     header;
+       uint32_t                        bucket[HFS_FSINFO_NAME_MAX_BUCKETS];
+};
+
+/*
+ * Structure to represent information about content protection classes
+ *
+ * WARNING: Any changes to this structure should also update version number to
+ * ensure that the clients and kernel are reading/writing correctly.
+ */
+struct hfs_fsinfo_cprotect {
+       hfs_fsinfo_header_t     header;
+       uint32_t class_A;
+       uint32_t class_B;
+       uint32_t class_C;
+       uint32_t class_D;
+       uint32_t class_E;
+       uint32_t class_F;
+};
+
+/*
+ * Union of all the different values returned by HFSIOC_FSINFO fsctl
+ */
+union hfs_fsinfo {
+       hfs_fsinfo_header_t                     header;
+       struct hfs_fsinfo_data          data;
+       struct hfs_fsinfo_metadata      metadata;
+       struct hfs_fsinfo_name          name;
+       struct hfs_fsinfo_cprotect cprotect;
+};
+typedef union hfs_fsinfo hfs_fsinfo;
+
+/*
+ * Type of FSINFO requested, specified by the caller in request_type field
+ */
+enum {
+       /* Information about number of allocation blocks for each metadata file, returns struct hfs_fsinfo_metadata */
+       HFS_FSINFO_METADATA_BLOCKS_INFO = 1,
+       
+       /* Information about number of extents for each metadata file, returns struct hfs_fsinfo_metadata */
+       HFS_FSINFO_METADATA_EXTENTS             = 2,
+       
+       /* Information about percentage of free nodes vs used nodes in metadata btrees, returns struct hfs_fsinfo_metadata */
+       HFS_FSINFO_METADATA_PERCENTFREE = 3,
+       
+       /* Distribution of number of extents for data files (data fork, no rsrc fork, no xattr), returns struct hfs_fsinfo_data */
+       HFS_FSINFO_FILE_EXTENT_COUNT    = 4,
+       
+       /* Distribution of extent sizes for data files (data fork, no rsrc fork, no xattr), returns struct hfs_fsinfo_data */
+       HFS_FSINFO_FILE_EXTENT_SIZE             = 5,
+       
+       /* Distribution of file sizes for data files (data fork, no rsrc fork, no xattr), returns struct hfs_fsinfo_data */
+       HFS_FSINFO_FILE_SIZE                    = 6,
+
+       /* Distribution of valence for all directories, returns struct hfs_fsinfo_data */
+       HFS_FSINFO_DIR_VALENCE                  = 7,
+       
+       /* Distribution of file/directory name size in unicode characters, returns struct hfs_fsinfo_name */
+       HFS_FSINFO_NAME_SIZE                    = 8,
+       
+       /* Distribution of extended attribute sizes, returns hfs_fsinfo_data */
+       HFS_FSINFO_XATTR_SIZE                   = 9,
+       
+       /* Distribution of free space for the entire file system, returns struct hfs_fsinfo_data */
+       HFS_FSINFO_FREE_EXTENTS                 = 10,
+
+       /* Information about number of files belonging to each class, returns hfs_fsinfo_cprotect */
+       HFS_FSINFO_FILE_CPROTECT_COUNT  = 11,
+
+       /*
+        * Distribution of symbolic link sizes for data files (data fork, no rsrc fork, no xattr),
+        * returns struct hfs_fsinfo_data
+        */
+       HFS_FSINFO_SYMLINK_SIZE                 = 12,
+};
+
 
 /* HFS FS CONTROL COMMANDS */
 
@@ -166,6 +350,8 @@ struct hfsinfo_metadata {
 
 
 /* 
+ * XXX: Will be deprecated and replaced by HFSIOC_GET_FSINFO
+ *
  * Get information about number of file system allocation blocks used by metadata 
  * files on the volume, including individual btrees and journal file.  The caller 
  * can determine the size of file system allocation block using value returned as 
@@ -178,6 +364,10 @@ struct hfsinfo_metadata {
 #define HFSIOC_CS_FREESPACE_TRIM _IOWR('h', 39, u_int32_t)
 #define HFS_CS_FREESPACE_TRIM    IOCBASECMD(HFSIOC_CS_FREESPACE_TRIM)
 
+/* Get file system information for the given volume */
+#define HFSIOC_GET_FSINFO        _IOWR('h', 45, hfs_fsinfo)
+#define HFS_GET_FSINFO           IOCBASECMD(HFSIOC_GET_FSINFO)
+
 #endif /* __APPLE_API_UNSTABLE */
 
 #endif /* ! _HFS_FSCTL_H_ */
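
For context, a userspace caller reaches the new request types through fsctl(2). The sketch below is not part of this commit and makes two assumptions: that hfs_fsctl.h is installed as <hfs/hfs_fsctl.h>, and that fsctl() is passed the full _IOWR-encoded HFSIOC_GET_FSINFO (the IOCBASECMD form, HFS_GET_FSINFO, being what the in-kernel ioctl handler switches on).

/* Hypothetical userspace sketch: ask for the file-size distribution on "/". */
#include <sys/fsctl.h>          /* fsctl(2) */
#include <hfs/hfs_fsctl.h>      /* hfs_fsinfo, HFSIOC_GET_FSINFO -- path assumed */
#include <stdio.h>
#include <string.h>

int
main(void)
{
        hfs_fsinfo fsinfo;

        memset(&fsinfo, 0, sizeof(fsinfo));
        fsinfo.header.version = HFS_FSINFO_VERSION;
        fsinfo.header.request_type = HFS_FSINFO_FILE_SIZE;

        if (fsctl("/", HFSIOC_GET_FSINFO, &fsinfo, 0) != 0) {
                perror("fsctl(HFSIOC_GET_FSINFO)");
                return 1;
        }

        /* bucket[i] counts files whose size falls in [2^(i-1), 2^i); bucket[0] is zero-length files. */
        for (int i = 0; i < HFS_FSINFO_DATA_MAX_BUCKETS; i++)
                printf("bucket[%2d]: %u\n", i, fsinfo.data.bucket[i]);

        return 0;
}

Only the header fields (version, request_type and, for the class requests, flags) need to be filled in on input; as the new hfs_fsinfo.c below shows, hfs_get_fsinfo() zeroes everything past the header before populating the reply.
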
diff --git a/bsd/hfs/hfs_fsinfo.c b/bsd/hfs/hfs_fsinfo.c
new file mode 100644 (file)
index 0000000..d307108
--- /dev/null
@@ -0,0 +1,891 @@
+/*
+ * Copyright (c) 2014-2015 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ * 
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ * 
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ * 
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ * 
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include <sys/cprotect.h>
+#include <sys/xattr.h>
+#include <sys/utfconv.h>
+#include <libkern/OSByteOrder.h>
+#include <kern/kalloc.h>
+#include <sys/stat.h>
+
+#include "hfs.h"
+#include "hfs_fsctl.h"
+#include "hfs_endian.h"
+#include "hfscommon/headers/BTreesInternal.h"
+#include "hfscommon/headers/BTreesPrivate.h"
+#include "hfscommon/headers/FileMgrInternal.h"
+
+#if CONFIG_PROTECT
+#include <hfs/hfs_cprotect.h>
+#endif
+
+
+union HFSPlusRecord {
+       HFSPlusCatalogFolder folder_record;
+       HFSPlusCatalogFile file_record;
+       HFSPlusCatalogThread thread_record;
+       HFSPlusExtentRecord extent_record;
+       HFSPlusAttrRecord attr_record;
+}; 
+typedef union HFSPlusRecord HFSPlusRecord;
+
+union HFSPlusKey {
+       HFSPlusExtentKey extent_key;
+       HFSPlusAttrKey attr_key;
+};
+typedef union HFSPlusKey HFSPlusKey;
+
+typedef enum traverse_btree_flag {
+       
+       //If set, extents btree will also be traversed along with catalog btree, so grab correct locks upfront
+       // If set, the extents btree will also be traversed along with the catalog btree, so grab correct locks upfront
+
+       // Getting content-protection attributes, allocate enough space to accommodate the records.
+       TRAVERSE_BTREE_XATTR_CPROTECT = 2,
+       
+} traverse_btree_flag_t;
+
+
+
+static errno_t hfs_fsinfo_metadata_blocks(struct hfsmount *hfsmp, struct hfs_fsinfo_metadata *fsinfo);
+static errno_t hfs_fsinfo_metadata_extents(struct hfsmount *hfsmp, struct hfs_fsinfo_metadata *fsinfo);
+static errno_t hfs_fsinfo_metadata_percentfree(struct hfsmount *hfsmp, struct hfs_fsinfo_metadata *fsinfo);
+static errno_t fsinfo_file_extent_count_callback(struct hfsmount *hfsmp, HFSPlusKey *key, HFSPlusRecord *record, void *data);
+static errno_t fsinfo_file_extent_size_catalog_callback(struct hfsmount *hfsmp, HFSPlusKey *key, HFSPlusRecord *record, void *data);
+static errno_t fsinfo_file_extent_size_overflow_callback(struct hfsmount *hfsmp, HFSPlusKey *key, HFSPlusRecord *record, void *data);
+static errno_t fsinfo_file_size_callback(struct hfsmount *hfsmp, HFSPlusKey *key, HFSPlusRecord *record, void *data);
+static errno_t fsinfo_dir_valence_callback(struct hfsmount *hfsmp, HFSPlusKey *key, HFSPlusRecord *record, void *data);
+static errno_t fsinfo_name_size_callback(struct hfsmount *hfsmp, HFSPlusKey *key, HFSPlusRecord *record, void *data);
+static errno_t fsinfo_xattr_size_callback(struct hfsmount *hfsmp, HFSPlusKey *key, HFSPlusRecord *record, void *data);
+static errno_t traverse_btree(struct hfsmount *hfsmp, uint32_t btree_fileID, traverse_btree_flag_t flags, void *fsinfo,
+               int (*callback)(struct hfsmount *, HFSPlusKey *, HFSPlusRecord *, void *));
+static errno_t hfs_fsinfo_free_extents(struct hfsmount *hfsmp, struct hfs_fsinfo_data *fsinfo);
+static void fsinfo_free_extents_callback(void *data, off_t free_extent_size);
+#if CONFIG_PROTECT
+static errno_t fsinfo_cprotect_count_callback(struct hfsmount *hfsmp, HFSPlusKey *key, HFSPlusRecord *record, void *data);
+#endif
+static errno_t fsinfo_symlink_size_callback(struct hfsmount *hfsmp, HFSPlusKey *key, HFSPlusRecord *record, void *data);
+
+/* 
+ * Entry function for all the fsinfo requests from hfs_vnop_ioctl() 
+ * Depending on the type of request, this function will call the 
+ * appropriate sub-function and return success or failure back to 
+ * the caller.
+ */
+__private_extern__
+errno_t hfs_get_fsinfo(struct hfsmount *hfsmp, void *a_data)
+{
+       int error = 0;
+       hfs_fsinfo *fsinfo_union;
+       uint32_t request_type;
+       uint32_t header_len = sizeof(hfs_fsinfo_header_t);
+
+       fsinfo_union = (hfs_fsinfo *)a_data;
+       request_type = fsinfo_union->header.request_type;
+
+       // Zero out output fields to fsinfo_union, keep the user input fields intact.
+       bzero((char *)fsinfo_union + header_len, sizeof(hfs_fsinfo) - header_len);
+
+       switch (request_type) {
+               case HFS_FSINFO_METADATA_BLOCKS_INFO:
+                       error = hfs_fsinfo_metadata_blocks(hfsmp, &(fsinfo_union->metadata));
+                       break;
+
+               case HFS_FSINFO_METADATA_EXTENTS:
+                       error = hfs_fsinfo_metadata_extents(hfsmp, &(fsinfo_union->metadata));
+                       break;
+
+               case HFS_FSINFO_METADATA_PERCENTFREE:
+                       error = hfs_fsinfo_metadata_percentfree(hfsmp, &(fsinfo_union->metadata));
+                       break;
+
+               case HFS_FSINFO_FILE_EXTENT_COUNT:
+                       /* Traverse catalog btree and invoke callback for all records */
+                       error = traverse_btree(hfsmp, kHFSCatalogFileID, TRAVERSE_BTREE_EXTENTS, &(fsinfo_union->data), fsinfo_file_extent_count_callback);
+                       break;
+
+               case HFS_FSINFO_FILE_EXTENT_SIZE:
+                       /* Traverse the catalog btree first */
+                       error = traverse_btree(hfsmp, kHFSCatalogFileID, 0, &(fsinfo_union->data), &fsinfo_file_extent_size_catalog_callback);
+                       if (error) {
+                               break;
+                       }
+                       /* Traverse the overflow extents btree now */
+                       error = traverse_btree(hfsmp, kHFSExtentsFileID, 0, &(fsinfo_union->data), &fsinfo_file_extent_size_overflow_callback);
+                       break;
+
+               case HFS_FSINFO_FILE_SIZE:
+                       /* Traverse catalog btree and invoke callback for all records */
+                       error = traverse_btree(hfsmp, kHFSCatalogFileID, 0, &(fsinfo_union->data), &fsinfo_file_size_callback);
+                       break;
+
+               case HFS_FSINFO_DIR_VALENCE:
+                       /* Traverse catalog btree and invoke callback for all records */
+                       error = traverse_btree(hfsmp, kHFSCatalogFileID, 0, &(fsinfo_union->data), &fsinfo_dir_valence_callback);
+                       break;
+
+               case HFS_FSINFO_NAME_SIZE:
+                       /* Traverse catalog btree and invoke callback for all records */
+                       error = traverse_btree(hfsmp, kHFSCatalogFileID, 0, &(fsinfo_union->name), &fsinfo_name_size_callback);
+                       break;
+
+               case HFS_FSINFO_XATTR_SIZE:
+                       /* Traverse attribute btree and invoke callback for all records */
+                       error = traverse_btree(hfsmp, kHFSAttributesFileID, 0, &(fsinfo_union->data), &fsinfo_xattr_size_callback);
+                       break;
+
+               case HFS_FSINFO_FREE_EXTENTS:
+                       error = hfs_fsinfo_free_extents(hfsmp, &(fsinfo_union->data));
+                       break;
+
+               case HFS_FSINFO_SYMLINK_SIZE:
+                       /* Traverse catalog btree and invoke callback for all records */
+                       error = traverse_btree(hfsmp, kHFSCatalogFileID, 0, &(fsinfo_union->data), &fsinfo_symlink_size_callback);
+                       break;
+
+#if CONFIG_PROTECT
+               case HFS_FSINFO_FILE_CPROTECT_COUNT:
+                       /* Traverse attribute btree and invoke callback for all records */
+                       error = traverse_btree(hfsmp, kHFSAttributesFileID, TRAVERSE_BTREE_XATTR_CPROTECT, &(fsinfo_union->cprotect), &fsinfo_cprotect_count_callback);
+                       break;
+#endif
+
+               default:
+                       return ENOTSUP;
+       };
+
+       return error;
+}
+
+/* 
+ * This function provides information about total number of allocation blocks 
+ * for each individual metadata file.
+ */
+static errno_t
+hfs_fsinfo_metadata_blocks(struct hfsmount *hfsmp, struct hfs_fsinfo_metadata *fsinfo)
+{
+       int lockflags = 0;
+       int ret_lockflags = 0;
+
+       /* 
+        * Getting number of allocation blocks for all metadata files 
+        * should be a relatively quick operation, so we grab locks for all
+        * the btrees at the same time
+        */
+       lockflags = SFL_CATALOG | SFL_EXTENTS | SFL_BITMAP | SFL_ATTRIBUTE;
+       ret_lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_SHARED_LOCK);
+
+       /* Get information about all the btrees */
+       fsinfo->extents    = hfsmp->hfs_extents_cp->c_datafork->ff_blocks;
+       fsinfo->catalog    = hfsmp->hfs_catalog_cp->c_datafork->ff_blocks;
+       fsinfo->allocation = hfsmp->hfs_allocation_cp->c_datafork->ff_blocks;
+       if (hfsmp->hfs_attribute_cp)
+               fsinfo->attribute  = hfsmp->hfs_attribute_cp->c_datafork->ff_blocks;
+       else
+               fsinfo->attribute = 0;
+
+       /* Done with btrees, give up the locks */
+       hfs_systemfile_unlock(hfsmp, ret_lockflags);
+
+       /* Get information about journal file */
+       fsinfo->journal = howmany(hfsmp->jnl_size, hfsmp->blockSize);
+
+       return 0;
+}
+
+/* 
+ * Helper function to count the number of valid extents in a file fork structure
+ */
+static uint32_t
+hfs_count_extents_fp(struct filefork *ff)
+{
+       int i;
+       uint32_t count = 0;
+       for (i = 0; i < kHFSPlusExtentDensity; i++) {
+               if (ff->ff_data.cf_extents[i].blockCount == 0) {
+                       break;
+               }
+               count++;
+       }
+       return count;
+}
+
+
+/* 
+ * This is a helper function that counts the total number of valid 
+ * extents in all the overflow extent records for given fileID 
+ * in overflow extents btree
+ */
+static errno_t
+hfs_count_overflow_extents(struct hfsmount *hfsmp, uint32_t fileID, uint32_t *num_extents)
+{
+       int error;
+       FCB *fcb;
+       struct BTreeIterator *iterator = NULL;
+       FSBufferDescriptor btdata;
+       HFSPlusExtentKey *extentKey;
+       HFSPlusExtentRecord extentData;
+       uint32_t extent_count = 0;
+       int i;
+
+       fcb = VTOF(hfsmp->hfs_extents_vp);
+       MALLOC(iterator, struct BTreeIterator *, sizeof(struct BTreeIterator), M_TEMP, M_WAITOK | M_ZERO);
+       
+       extentKey = (HFSPlusExtentKey *) &iterator->key;        
+       extentKey->keyLength = kHFSPlusExtentKeyMaximumLength;
+       extentKey->forkType = kHFSDataForkType;
+       extentKey->fileID = fileID;
+       extentKey->startBlock = 0;
+
+       btdata.bufferAddress = &extentData;
+       btdata.itemSize = sizeof(HFSPlusExtentRecord);
+       btdata.itemCount = 1;
+
+       /* Search for overflow extent record */
+       error = BTSearchRecord(fcb, iterator, &btdata, NULL, iterator);
+       
+       /*
+        * We used startBlock of zero, so we will not find any records and errors
+        * are expected.  It will also position the iterator just before the first 
+        * overflow extent record for given fileID (if any). 
+        */
+       if (error && error != fsBTRecordNotFoundErr && error != fsBTEndOfIterationErr)
+                       goto out;
+       error = 0;
+
+       for (;;) {
+               
+               if (msleep(NULL, NULL, PINOD | PCATCH,
+                                  "hfs_fsinfo", NULL) == EINTR) {
+                       error = EINTR;
+                       break;
+               }
+               
+               error = BTIterateRecord(fcb, kBTreeNextRecord, iterator, &btdata, NULL);
+               if (error != 0) {
+                       /* These are expected errors, so mask them */
+                       if (error == fsBTRecordNotFoundErr || error == fsBTEndOfIterationErr) {
+                               error = 0;
+                       }
+                       break;
+               }
+
+               /* If we encounter different fileID, stop the iteration */
+               if (extentKey->fileID != fileID) {
+                       break;
+               }
+               
+               if (extentKey->forkType != kHFSDataForkType)
+                       break;
+               
+               /* This is our record of interest; only count the datafork extents. */
+               for (i = 0; i < kHFSPlusExtentDensity; i++) {
+                       if (extentData[i].blockCount == 0) {
+                               break;
+                       }
+                       extent_count++;
+               }
+       }
+
+out:
+       FREE(iterator, M_TEMP);
+
+       if (error == 0) {
+               *num_extents = extent_count;
+       }
+       return MacToVFSError(error);
+}
+
+/*
+ * This function provides information about total number of extents (including 
+ * extents from overflow extents btree, if any) for each individual metadata 
+ * file.
+ */
+static errno_t
+hfs_fsinfo_metadata_extents(struct hfsmount *hfsmp, struct hfs_fsinfo_metadata *fsinfo)
+{
+       int error = 0;
+       int lockflags = 0;
+       int ret_lockflags = 0;
+       uint32_t overflow_count;
+
+       /*
+        * Counting the number of extents for all metadata files should
+        * be a relatively quick operation, so we grab locks for all the
+        * btrees at the same time
+        */
+       lockflags = SFL_CATALOG | SFL_EXTENTS | SFL_BITMAP | SFL_ATTRIBUTE;
+       ret_lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_SHARED_LOCK);
+
+       /* Get number of extents for extents overflow btree */
+       fsinfo->extents = hfs_count_extents_fp(hfsmp->hfs_extents_cp->c_datafork);
+
+       /* Get number of extents for catalog btree */
+       fsinfo->catalog = hfs_count_extents_fp(hfsmp->hfs_catalog_cp->c_datafork);
+       if (fsinfo->catalog >= kHFSPlusExtentDensity) {
+               error = hfs_count_overflow_extents(hfsmp, kHFSCatalogFileID, &overflow_count);
+               if (error) {
+                       goto out;
+               }
+               fsinfo->catalog += overflow_count;
+       }
+
+       /* Get number of extents for allocation file */
+       fsinfo->allocation = hfs_count_extents_fp(hfsmp->hfs_allocation_cp->c_datafork);
+       if (fsinfo->allocation >= kHFSPlusExtentDensity) {
+               error = hfs_count_overflow_extents(hfsmp, kHFSAllocationFileID, &overflow_count);
+               if (error) {
+                       goto out;
+               }
+               fsinfo->allocation += overflow_count;
+       }
+
+       /*
+        * Get number of extents for attribute btree.
+        *      hfs_attribute_cp might be NULL.
+        */
+       if (hfsmp->hfs_attribute_cp) {
+               fsinfo->attribute = hfs_count_extents_fp(hfsmp->hfs_attribute_cp->c_datafork);
+               if (fsinfo->attribute >= kHFSPlusExtentDensity) {
+                       error = hfs_count_overflow_extents(hfsmp, kHFSAttributesFileID, &overflow_count);
+                       if (error) {
+                               goto out;
+                       }
+                       fsinfo->attribute += overflow_count;
+               }
+       }
+       /* Journal always has one extent */
+       fsinfo->journal = 1;
+out:
+       hfs_systemfile_unlock(hfsmp, ret_lockflags);
+       return error;
+}
+
+/* 
+ * Helper function to calculate percentage i.e. X is what percent of Y?
+ */
+static inline uint32_t 
+hfs_percent(uint32_t X, uint32_t Y)
+{
+       return (X * 100ll) / Y;
+}
+
+/*
+ * This function provides percentage of free nodes vs total nodes for each 
+ * individual metadata btrees, i.e. for catalog, overflow extents and 
+ * attributes btree.  This information is not applicable for allocation 
+ * file and journal file.
+ */
+static errno_t
+hfs_fsinfo_metadata_percentfree(struct hfsmount *hfsmp, struct hfs_fsinfo_metadata *fsinfo)
+{
+       int lockflags = 0;
+       int ret_lockflags = 0;
+       BTreeControlBlockPtr btreePtr;
+       uint32_t free_nodes, total_nodes;
+
+       /*
+        * Getting total and used nodes for all metadata btrees should 
+        * be a relatively quick operation, so we grab locks for all the
+        * btrees at the same time
+        */
+       lockflags = SFL_CATALOG | SFL_EXTENTS | SFL_BITMAP | SFL_ATTRIBUTE;
+       ret_lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_SHARED_LOCK);
+       
+       /* Overflow extents btree */
+       btreePtr = VTOF(hfsmp->hfs_extents_vp)->fcbBTCBPtr;
+       total_nodes = btreePtr->totalNodes;
+       free_nodes = btreePtr->freeNodes;
+       fsinfo->extents = hfs_percent(free_nodes, total_nodes);
+
+       /* Catalog btree */
+       btreePtr = VTOF(hfsmp->hfs_catalog_vp)->fcbBTCBPtr;
+       total_nodes = btreePtr->totalNodes;
+       free_nodes = btreePtr->freeNodes;
+       fsinfo->catalog = hfs_percent(free_nodes, total_nodes);
+
+       /* Attributes btree */
+       if (hfsmp->hfs_attribute_vp) {
+               btreePtr = VTOF(hfsmp->hfs_attribute_vp)->fcbBTCBPtr;
+               total_nodes = btreePtr->totalNodes;
+               free_nodes = btreePtr->freeNodes;
+               fsinfo->attribute = hfs_percent(free_nodes, total_nodes);
+       }
+
+       hfs_systemfile_unlock(hfsmp, ret_lockflags);
+       return 0;
+}
+
+/* 
+ * Helper function to calculate log base 2 for given number 
+ */
+static inline int 
+hfs_log2(uint64_t entry) 
+{
+       return (63 - __builtin_clzll(entry|1));
+}
+
+/*
+ * Helper function to account for input entry into the data 
+ * array based on its log base 2 value
+ */
+__private_extern__
+void hfs_fsinfo_data_add(struct hfs_fsinfo_data *fsinfo, uint64_t entry)
+{
+       /* 
+        * From hfs_fsctl.h - 
+        *
+        * hfs_fsinfo_data is generic data structure to aggregate information like sizes 
+        * or counts in buckets of power of 2.  Each bucket represents a range of values 
+        * that is determined based on its index in the array.  Specifically, buckets[i] 
+        * represents values that are greater than or equal to 2^(i-1) and less than 2^i, 
+        * except the last bucket which represents range greater than or equal to 2^(i-1)
+        *
+        * The current maximum number of buckets is 42 (indices 0 to 41), so we can represent range from
+        * 0 up to 1TB in increments of power of 2, and then a catch-all bucket of 
+        * anything that is greater than or equal to 1TB.
+        *
+        * For example, 
+        * bucket[0]  -> greater than or equal to 0 and less than 1
+        * bucket[1]  -> greater than or equal to 1 and less than 2
+        * bucket[10] -> greater than or equal to 2^(10-1) = 512 and less than 2^10 = 1024
+        * bucket[20] -> greater than or equal to 2^(20-1) = 512KB and less than 2^20 = 1MB
+        * bucket[41] -> greater than or equal to 2^(41-1) = 1TB
+        */
+       uint32_t bucket;
+
+       if (entry) {
+               /* 
+                * Calculate log base 2 value for the entry.
+                * Account for this value in the appropriate bucket.
+                * The last bucket is a catch-all bucket of
+                * anything that is greater than or equal to 1TB
+                */
+               bucket = MIN(hfs_log2(entry) + 1, HFS_FSINFO_DATA_MAX_BUCKETS-1);
+               ++fsinfo->bucket[bucket];
+       } else {
+               /* Entry is zero, so account it in 0th offset */
+               fsinfo->bucket[0]++;
+       }
+}
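
A quick worked example of the bucketing above (an editorial illustration, not part of the file):

/*
 * hfs_fsinfo_data_add(fsinfo, 1000):
 *     hfs_log2(1000) = 9               (512 <= 1000 < 1024)
 *     bucket = MIN(9 + 1, 41) = 10     -> counted in bucket[10], i.e. the range [512, 1024)
 *
 * hfs_fsinfo_data_add(fsinfo, 2TB):
 *     hfs_log2(2^41) = 41
 *     bucket = MIN(41 + 1, 41) = 41    -> clamped into the catch-all ">= 1TB" bucket
 */
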
+
+/* 
+ * Function to traverse all the records of a btree and then call caller-provided 
+ * callback function for every record found.  The type of btree is chosen based 
+ * on the fileID provided by the caller.  This function grabs the correct locks 
+ * depending on the type of btree it will be traversing and flags provided 
+ * by the caller.
+ *
+ * Note: It might drop and reacquire the locks during execution.
+ */
+static errno_t
+traverse_btree(struct hfsmount *hfsmp, uint32_t btree_fileID, traverse_btree_flag_t flags,
+                          void *fsinfo, int (*callback)(struct hfsmount *, HFSPlusKey *, HFSPlusRecord *, void *))
+{
+       int error = 0;
+       int lockflags = 0;
+       int ret_lockflags = 0;
+       FCB *fcb;
+       struct BTreeIterator *iterator = NULL;
+       struct FSBufferDescriptor btdata;
+       int btree_operation;
+       HFSPlusRecord record;
+       HFSPlusKey *key;
+       uint64_t start, timeout_abs;
+
+       switch(btree_fileID) {
+               case kHFSExtentsFileID: 
+                       fcb = VTOF(hfsmp->hfs_extents_vp);
+                       lockflags = SFL_EXTENTS;
+                       break;
+               case kHFSCatalogFileID:
+                       fcb = VTOF(hfsmp->hfs_catalog_vp);
+                       lockflags = SFL_CATALOG;
+                       break;
+               case kHFSAttributesFileID:
+                       // If the attributes file doesn't exist, there are no records to iterate.
+                       if (hfsmp->hfs_attribute_vp == NULL)
+                               return error;
+                       fcb = VTOF(hfsmp->hfs_attribute_vp);
+                       lockflags = SFL_ATTRIBUTE;
+                       break;
+
+               default:
+                       return EINVAL;
+       }
+
+       MALLOC(iterator, struct BTreeIterator *, sizeof(struct BTreeIterator), M_TEMP, M_WAITOK | M_ZERO);
+
+       /* The key is initialized to zero because we are traversing entire btree */
+       key = (HFSPlusKey *)&iterator->key;
+
+       if (flags & TRAVERSE_BTREE_EXTENTS) {
+               lockflags |= SFL_EXTENTS;
+       }
+
+       btdata.bufferAddress = &record;
+       btdata.itemSize = sizeof(HFSPlusRecord);
+       btdata.itemCount = 1;
+
+       /* Lock btree for duration of traversal */
+       ret_lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_SHARED_LOCK);
+       btree_operation = kBTreeFirstRecord;
+
+       nanoseconds_to_absolutetime(HFS_FSINFO_MAX_LOCKHELD_TIME, &timeout_abs);
+       start = mach_absolute_time();
+
+       while (1) {
+
+               if (msleep(NULL, NULL, PINOD | PCATCH,
+                                  "hfs_fsinfo", NULL) == EINTR) {
+                       error = EINTR;
+                       break;
+               }
+
+               error = BTIterateRecord(fcb, btree_operation, iterator, &btdata, NULL);
+               if (error != 0) {
+                       if (error == fsBTRecordNotFoundErr || error == fsBTEndOfIterationErr) {
+                               error = 0;
+                       }
+                       break;
+               }
+               /* Lookup next btree record on next call to BTIterateRecord() */
+               btree_operation = kBTreeNextRecord;
+
+               /* Call our callback function and stop iteration if there are any errors */
+               error = callback(hfsmp, key, &record, fsinfo);
+               if (error) {
+                       break;
+               }
+
+               /* let someone else use the tree after we've processed over HFS_FSINFO_MAX_LOCKHELD_TIME */
+               if ((mach_absolute_time() - start) >= timeout_abs) {
+
+                       /* release b-tree locks and let someone else get the lock */
+                       hfs_systemfile_unlock (hfsmp, ret_lockflags);
+
+                       /* add tsleep here to force context switch and fairness */
+                       tsleep((caddr_t)hfsmp, PRIBIO, "hfs_fsinfo", 1);
+
+                       /*
+                        * re-acquire the locks in the same way that we wanted them originally.
+                        * note: it is subtle but worth pointing out that between the time we
+                        * released these locks and now, the b-trees may have shifted
+                        * slightly but significantly. For example, the catalog or other b-tree could have grown
+                        * past 8 extents and now requires the extents lock to be held in order to be safely
+                        * manipulated. We can't be sure of the state of the b-tree from where we last left off.
+                        */
+
+                       ret_lockflags = hfs_systemfile_lock (hfsmp, lockflags, HFS_SHARED_LOCK);
+
+                       /*
+                        * It's highly likely that the search key we stashed away before dropping the lock
+                        * no longer points to an existing item.  The iterator's IterateRecord is able to
+                        * re-position itself and process the next record correctly.  With the lock dropped,
+                        * some records may be missed for statistics gathering, which is ok. The
+                        * point is to get aggregate values.
+                        */
+
+                       start = mach_absolute_time();
+
+                       /* loop back around and get another record */
+               }
+       }
+
+       hfs_systemfile_unlock(hfsmp, ret_lockflags);
+       FREE (iterator, M_TEMP);
+       return MacToVFSError(error);
+}
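
traverse_btree() is a generic visitor: it picks the b-tree from the fileID, takes the matching shared lock (plus the extents lock when TRAVERSE_BTREE_EXTENTS is set), and hands every leaf record to the supplied callback, periodically dropping and re-taking the locks for fairness. A minimal sketch of a compatible callback and call site follows; count_file_records() and struct file_count are hypothetical names used only for illustration and are not part of this change.

    /* Hypothetical callback: counts kHFSPlusFileRecord entries seen during the walk. */
    struct file_count {
            uint64_t files;
    };

    static int
    count_file_records(__unused struct hfsmount *hfsmp, __unused HFSPlusKey *key,
                    HFSPlusRecord *record, void *data)
    {
            struct file_count *fc = (struct file_count *)data;

            if (record->file_record.recordType == kHFSPlusFileRecord)
                    fc->files++;
            return 0;               /* returning non-zero stops the traversal */
    }

    /* Call site sketch: walk the catalog b-tree while also holding the extents lock. */
    static errno_t
    count_files(struct hfsmount *hfsmp, uint64_t *out)
    {
            struct file_count fc = { 0 };
            errno_t error;

            error = traverse_btree(hfsmp, kHFSCatalogFileID, TRAVERSE_BTREE_EXTENTS,
                            &fc, count_file_records);
            if (error == 0)
                    *out = fc.files;
            return error;
    }
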
+
+/* 
+ * Callback function to get distribution of number of extents 
+ * for all user files in given file system.  Note that this only 
+ * accounts for data fork, no resource fork. 
+ */
+static errno_t
+fsinfo_file_extent_count_callback(struct hfsmount *hfsmp, 
+               __unused HFSPlusKey *key, HFSPlusRecord *record, void *data)
+{
+       int i;
+       int error = 0;
+       uint32_t num_extents = 0;
+       uint32_t num_overflow = 0;
+       uint32_t blockCount;
+
+       if (record->file_record.recordType == kHFSPlusFileRecord) {
+               /* Count total number of extents for this file */
+               for (i = 0; i < kHFSPlusExtentDensity; i++) {
+                       blockCount = record->file_record.dataFork.extents[i].blockCount;
+                       if (blockCount == 0) {
+                               break;
+                       }
+                       num_extents++;
+               }
+               /* This file has overflow extent records, so search overflow btree */
+               if (num_extents >= kHFSPlusExtentDensity) {
+                       /* The caller also holds the extents overflow btree lock */
+                       error = hfs_count_overflow_extents(hfsmp, record->file_record.fileID, &num_overflow);
+                       if (error) {
+                               goto out;
+                       }
+                       num_extents += num_overflow;
+               }
+               hfs_fsinfo_data_add(data, num_extents);
+       }
+out:
+       return error;
+}
+
+/* 
+ * Callback function to get distribution of individual extent sizes
+ * (in bytes) for all user files in given file system from catalog 
+ * btree only.  Note that this only accounts for data fork, no resource 
+ * fork. 
+ */
+static errno_t fsinfo_file_extent_size_catalog_callback(__unused struct hfsmount *hfsmp,
+               __unused HFSPlusKey *key, HFSPlusRecord *record, void *data)
+{
+       int i;
+       uint32_t blockCount;
+       uint64_t extent_size;
+
+       if (record->file_record.recordType == kHFSPlusFileRecord) {
+               /* Traverse through all valid extents */
+               for (i = 0; i < kHFSPlusExtentDensity; i++) {
+                       blockCount = record->file_record.dataFork.extents[i].blockCount;
+                       if (blockCount == 0) {
+                               break;
+                       }
+                       extent_size = hfs_blk_to_bytes(blockCount, hfsmp->blockSize);
+                       hfs_fsinfo_data_add(data, extent_size);
+               }
+       }
+       return 0;
+}
+
+/* 
+ * Callback function to get distribution of individual extent sizes
+ * (in bytes) for all user files in given file system from overflow 
+ * extents btree only.  Note that this only accounts for data fork, 
+ * no resource fork. 
+ */
+static errno_t fsinfo_file_extent_size_overflow_callback(__unused struct hfsmount *hfsmp,
+               HFSPlusKey *key, HFSPlusRecord *record, void *data)
+{
+       int i;
+       uint32_t blockCount;
+       uint64_t extent_size;
+
+       if (key->extent_key.fileID >= kHFSFirstUserCatalogNodeID) {
+               // Only count the data fork extents.
+               if (key->extent_key.forkType == kHFSDataForkType) {
+                       for (i = 0; i < kHFSPlusExtentDensity; i++) {
+                               blockCount = record->extent_record[i].blockCount;
+                               if (blockCount == 0) {
+                                       break;
+                               }
+                               extent_size = hfs_blk_to_bytes(blockCount, hfsmp->blockSize);
+                               hfs_fsinfo_data_add(data, extent_size);
+                       }
+               }
+       }
+       return 0;
+}
+
+/* 
+ * Callback function to get distribution of file sizes (in bytes) 
+ * for all user files in given file system.  Note that this only 
+ * accounts for data fork, no resource fork. 
+ */
+static errno_t fsinfo_file_size_callback(__unused struct hfsmount *hfsmp,
+               __unused HFSPlusKey *key, HFSPlusRecord *record, void *data)
+{
+       if (record->file_record.recordType == kHFSPlusFileRecord) {
+               /* Record of interest, account for the size in the bucket */
+               hfs_fsinfo_data_add(data, record->file_record.dataFork.logicalSize);
+       }
+       return 0;
+}
+
+/*
+ * Callback function to get distribution of directory valence 
+ * for all directories in the given file system.
+ */
+static errno_t fsinfo_dir_valence_callback(__unused struct hfsmount *hfsmp,
+               __unused HFSPlusKey *key, HFSPlusRecord *record, void *data)
+{
+       if (record->folder_record.recordType == kHFSPlusFolderRecord) {
+               hfs_fsinfo_data_add(data, record->folder_record.valence);
+       }
+       return 0;
+}
+
+/* 
+ * Callback function to get distribution of number of unicode 
+ * characters in name for all files and directories for a given 
+ * file system.
+ */
+static errno_t fsinfo_name_size_callback(__unused struct hfsmount *hfsmp,
+               __unused HFSPlusKey *key, HFSPlusRecord *record, void *data)
+{
+       struct hfs_fsinfo_name *fsinfo = (struct hfs_fsinfo_name *)data;
+       uint32_t length;
+
+       if ((record->folder_record.recordType == kHFSPlusFolderThreadRecord) ||
+           (record->folder_record.recordType == kHFSPlusFileThreadRecord)) {
+               length = record->thread_record.nodeName.length;
+               /* Make sure that the nodeName is bounded, otherwise return error */
+               if (length > kHFSPlusMaxFileNameChars) {
+                       return EIO;
+               }
+               
+               // sanity check for a name length of zero, which isn't valid on disk.
+               if (length == 0)
+                       return EIO;
+               
+               /* Convert the length to a bucket index; lengths are grouped five characters per bucket */
+               length = (length - 1)/ 5;
+               /* Account this value into our bucket */
+               fsinfo->bucket[length]++;
+       }
+       return 0;
+}
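
For reference, the bucket arithmetic above groups name lengths five characters per bucket: (length - 1) / 5 maps lengths 1 through 5 to bucket 0, 6 through 10 to bucket 1, and the maximum of kHFSPlusMaxFileNameChars (255) to bucket 50. A stand-alone sketch of the same mapping:

    /* Illustration only: same bucketing as fsinfo_name_size_callback(). */
    #include <stdio.h>

    int main(void)
    {
            unsigned lengths[] = { 1, 5, 6, 10, 11, 255 };

            for (unsigned i = 0; i < sizeof(lengths) / sizeof(lengths[0]); i++)
                    printf("name length %3u -> bucket %u\n",
                        lengths[i], (lengths[i] - 1) / 5);
            return 0;       /* prints buckets 0, 0, 1, 1, 2 and 50 */
    }
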
+
+/* 
+ * Callback function to get distribution of size of all extended 
+ * attributes for a given file system.
+ */
+static errno_t fsinfo_xattr_size_callback(__unused struct hfsmount *hfsmp,
+               __unused HFSPlusKey *key, HFSPlusRecord *record, void *data)
+{
+       if (record->attr_record.recordType == kHFSPlusAttrInlineData) {
+               /* Inline attribute */
+               hfs_fsinfo_data_add(data, record->attr_record.attrData.attrSize);
+       } else if (record->attr_record.recordType == kHFSPlusAttrForkData) {
+               /* Larger attributes with extents information */
+               hfs_fsinfo_data_add(data, record->attr_record.forkData.theFork.logicalSize);
+       }
+       return 0;
+}
+
+
+/*
+ * Callback function to get distribution of free space extents for a given file system.
+ */
+static void fsinfo_free_extents_callback(void *data, off_t free_extent_size)
+{
+       // Assume a minimum of 4 KB block size
+       hfs_fsinfo_data_add(data, free_extent_size / 4096);
+}
+
+/*
+ * Function to get distribution of free space extents for a given file system.
+ */
+static errno_t hfs_fsinfo_free_extents(struct hfsmount *hfsmp, struct hfs_fsinfo_data *fsinfo)
+{
+       return hfs_find_free_extents(hfsmp, &fsinfo_free_extents_callback, fsinfo);
+}
+
+/*
+ * Callback function to get distribution of symbolic link sizes (in bytes)
+ * for all user files in given file system.  Note that this only
+ * accounts for data fork, no resource fork.
+ */
+static errno_t fsinfo_symlink_size_callback(__unused struct hfsmount *hfsmp,
+                                                                        __unused HFSPlusKey *key, HFSPlusRecord *record, void *data)
+{
+       if (record->file_record.recordType == kHFSPlusFileRecord) {
+               /* Record of interest, account for the size in the bucket */
+               if (S_ISLNK(record->file_record.bsdInfo.fileMode))
+                       hfs_fsinfo_data_add((struct hfs_fsinfo_data *)data, record->file_record.dataFork.logicalSize);
+       }
+       return 0;
+}
+
+#if CONFIG_PROTECT
+/*
+ * Callback function to get total number of files/directories
+ * for each content protection class
+ */
+static int fsinfo_cprotect_count_callback(struct hfsmount *hfsmp, HFSPlusKey *key,
+                                                                                 HFSPlusRecord *record, void *data)
+{
+       struct hfs_fsinfo_cprotect *fsinfo = (struct hfs_fsinfo_cprotect *)data;
+       static const uint16_t cp_xattrname_utf16[] = CONTENT_PROTECTION_XATTR_NAME_CHARS;
+       static const size_t cp_xattrname_utf16_len = sizeof(cp_xattrname_utf16)/2;
+       struct cp_xattr_v5 *xattr;
+       size_t xattr_len = sizeof(struct cp_xattr_v5);
+       struct cprotect cp_entry;
+       struct cprotect *cp_entryp = &cp_entry;
+       int error = 0;
+
+       /* Content protect xattrs are inline attributes only, so skip all others */
+       if (record->attr_record.recordType != kHFSPlusAttrInlineData)
+               return 0;
+
+       /* We only look at content protection xattrs */
+       if ((key->attr_key.attrNameLen != cp_xattrname_utf16_len) ||
+               (bcmp(key->attr_key.attrName, cp_xattrname_utf16, cp_xattrname_utf16_len))) {
+               return 0;
+       }
+
+       xattr = (struct cp_xattr_v5 *)((void *)(record->attr_record.attrData.attrData));
+       error = cp_read_xattr_v5(hfsmp, xattr, xattr_len, (cprotect_t *)&cp_entryp,
+                                                        CP_GET_XATTR_BASIC_INFO);
+       if (error)
+               return 0;
+
+       /* No key present, skip this record */
+       if (!ISSET(cp_entry.cp_flags, CP_HAS_A_KEY))
+               return 0;
+
+       /* Now account for the persistent class */
+       switch (CP_CLASS(cp_entry.cp_pclass)) {
+               case PROTECTION_CLASS_A:
+                       fsinfo->class_A++;
+                       break;
+               case PROTECTION_CLASS_B:
+                       fsinfo->class_B++;
+                       break;
+               case PROTECTION_CLASS_C:
+                       fsinfo->class_C++;
+                       break;
+               case PROTECTION_CLASS_D:
+                       fsinfo->class_D++;
+                       break;
+               case PROTECTION_CLASS_E:
+                       fsinfo->class_E++;
+                       break;
+               case PROTECTION_CLASS_F:
+                       fsinfo->class_F++;
+                       break;
+       };
+
+       return 0;
+}
+#endif
index 96e8c20ed3fb2bb1835ebf0ab6542fe1a7f88b91..f09bdc7d2609867889becac6b21bbc0c6795b79c 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -1578,7 +1578,7 @@ do_bulk_access_check(struct hfsmount *hfsmp, struct vnode *vp,
 int
 hfs_vnop_ioctl( struct vnop_ioctl_args /* {
                vnode_t a_vp;
-               int  a_command;
+               long  a_command;
                caddr_t  a_data;
                int  a_fflag;
                vfs_context_t a_context;
@@ -2654,6 +2654,37 @@ fail_change_next_allocation:
                break;
        }
 
+       case HFS_GET_FSINFO: {
+               hfs_fsinfo *fsinfo = (hfs_fsinfo *)ap->a_data;
+
+               /* Only root is allowed to get fsinfo */
+               if (!kauth_cred_issuser(kauth_cred_get())) {
+                       return EACCES;
+               }
+
+               /*
+                * Make sure that the caller's version number matches with
+                * the kernel's version number.  This will make sure that
+                * if the structures being read/written into are changed
+                * by the kernel, the caller will not read incorrect data.
+                *
+                * The first three fields --- request_type, version and
+                * flags are the same for all the hfs_fsinfo structures, so
+                * we can access the version number by assuming any
+                * structure for now.
+                */
+               if (fsinfo->header.version != HFS_FSINFO_VERSION) {
+                       return ENOTSUP;
+               }
+
+               /* Make sure that the current file system is not marked inconsistent */
+               if (hfsmp->vcbAtrb & kHFSVolumeInconsistentMask) {
+                       return EIO;
+               }
+
+               return hfs_get_fsinfo(hfsmp, ap->a_data);
+       }
+
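
From userspace this path is reached through fsctl(2) on the mounted volume. A rough sketch is below; it assumes the hfs_fsinfo definitions from bsd/hfs/hfs_fsctl.h are visible to the caller, and the header.request_type field name and the exact request constant passed to fsctl() are taken on trust from that header rather than from this hunk. Per the checks above, the call fails with EACCES for non-root callers, ENOTSUP on a version mismatch, and EIO if the volume is marked inconsistent.

    /* Userspace sketch (run as root); HFS_FSINFO_VERSION must match the kernel. */
    #include <string.h>
    #include <sys/fsctl.h>
    #include "hfs_fsctl.h"          /* assumed to be available to the build */

    static int
    get_fsinfo(const char *mount_path, hfs_fsinfo *info, uint32_t request_type)
    {
            memset(info, 0, sizeof(*info));
            info->header.request_type = request_type;   /* one of the HFS_FSINFO_* requests */
            info->header.version = HFS_FSINFO_VERSION;

            /* request constant shown as the kernel switch uses it; the userspace
             * header may expose it under a slightly different name */
            return fsctl(mount_path, HFS_GET_FSINFO, info, 0);
    }
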
        case HFS_CS_FREESPACE_TRIM: {
                int error = 0;
                int lockflags = 0;
index 5bfc09c3e8f1678fd1367cf111c5f34251cdd952..0c327a7922f7f2e11e9695514b28590a47a43a46 100644 (file)
@@ -3655,8 +3655,8 @@ relock:
         * truncate lock)
         */
 rm_done:
-       hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
        hfs_unlockpair(dcp, cp);
+       hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
 
        if (recycle_rsrc) {
                /* inactive or reclaim on rvp will clean up the blocks from the rsrc fork */
@@ -5224,12 +5224,12 @@ out:
            wakeup((caddr_t)&tdcp->c_flag);
        }
 
+       hfs_unlockfour(fdcp, fcp, tdcp, tcp);
+
        if (took_trunc_lock) {
                hfs_unlock_truncate(VTOC(tvp), HFS_LOCK_DEFAULT);       
        }
 
-       hfs_unlockfour(fdcp, fcp, tdcp, tcp);
-       
        /* Now vnode_put the resource forks vnodes if necessary */
        if (tvp_rsrc) {
                vnode_put(tvp_rsrc);
index d53fd5fd50c59dff6656c55cea0ad4b7168944da..909ab5c1d2fa1fa061b4781110fffecf50b3a10b 100644 (file)
@@ -642,7 +642,7 @@ static OSErr  MoveExtents( ExtendedVCB *vcb, u_int32_t srcFileID, u_int32_t dest
                
                if ( i != kNumExtentsToCache )                  //      if the buffer is not full, we must be done
                {
-                       err = DeleteExtents( vcb, srcFileID, forkType, quitEarly, isHFSPlus );  //      Now delete all the extent entries with the sourceID
+                       err = DeleteExtents( vcb, srcFileID, quitEarly, forkType, isHFSPlus );  //      Now delete all the extent entries with the sourceID
                        if ( DEBUG_BUILD && err != noErr )
                                DebugStr("Error from DeleteExtents");
                        break;                                                                  //      we're done!
index b49cf439c6a0ab20b0551561b38b5e9084a1ff73..79547be7fbb7ff9837bbc9b32dde23560b9bb173 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -155,31 +155,36 @@ Optimization Routines
                                        
 */
 
-#include "../../hfs_macos_defs.h"
-
 #include <sys/types.h>
 #include <sys/buf.h>
+
+
+#if !HFS_ALLOC_TEST
+
+#include "../../hfs_macos_defs.h"
 #include <sys/systm.h>
-#include <sys/sysctl.h>
-#include <sys/disk.h>
 #include <sys/ubc.h>
-#include <sys/uio.h>
 #include <kern/kalloc.h>
-#include <sys/malloc.h>
 
 /* For VM Page size */
 #include <libkern/libkern.h>
-
 #include "../../hfs.h"
-#include "../../hfs_dbg.h"
-#include "../../hfs_format.h"
 #include "../../hfs_endian.h"
-#include "../../hfs_macos_defs.h"
 #include "../headers/FileMgrInternal.h"
+#include <vfs/vfs_journal.h>
+
+#endif // !HFS_ALLOC_TEST
+
+#include <sys/sysctl.h>
+#include <sys/disk.h>
+#include <sys/uio.h>
+#include <sys/malloc.h>
+
+#include "../../hfs_dbg.h"
+#include "../../hfs_format.h"
 #include "../../hfs_kdebug.h"
 
 /* Headers for unmap-on-mount support */
-#include <vfs/vfs_journal.h>
 #include <sys/disk.h>
 
 #ifndef CONFIG_HFS_TRIM
@@ -357,6 +362,30 @@ static void remove_free_extent_cache(struct hfsmount *hfsmp, u_int32_t startBloc
 static Boolean add_free_extent_cache(struct hfsmount *hfsmp, u_int32_t startBlock, u_int32_t blockCount);
 static void sanity_check_free_ext(struct hfsmount *hfsmp, int check_allocated);
 
+/* Functions for getting free extents */
+
+typedef struct bitmap_context {
+       void                    *bitmap;                                // current bitmap chunk
+       uint32_t                run_offset;                             // offset (in bits) from start of bitmap to start of current run
+       uint32_t                chunk_current;                  // next bit to scan in the chunk
+       uint32_t                chunk_end;                              // number of valid bits in this chunk
+       struct hfsmount *hfsmp;
+       struct buf              *bp;
+       uint32_t                last_free_summary_bit;  // last marked free summary bit
+       int                             lockflags;
+       uint64_t                lock_start;
+} bitmap_context_t;
+
+
+static errno_t get_more_bits(bitmap_context_t *bitmap_ctx);
+static int bit_count_set(void *bitmap, int start, int end);
+static int bit_count_clr(void *bitmap, int start, int end);
+static errno_t hfs_bit_count(bitmap_context_t *bitmap_ctx, int (*fn)(void *, int ,int), uint32_t *bit_count);
+static errno_t hfs_bit_count_set(bitmap_context_t *bitmap_ctx, uint32_t *count);
+static errno_t hfs_bit_count_clr(bitmap_context_t *bitmap_ctx, uint32_t *count);
+static errno_t update_summary_table(bitmap_context_t *bitmap_ctx, uint32_t start, uint32_t count, bool set);
+static int clzll(uint64_t x);
+
 #if ALLOC_DEBUG
 /*
  * Validation Routine to verify that the TRIM list maintained by the journal
@@ -5153,3 +5182,462 @@ static void sanity_check_free_ext(struct hfsmount *hfsmp, int check_allocated)
        lck_spin_unlock(&hfsmp->vcbFreeExtLock);
 }
 
+#define BIT_RIGHT_MASK(bit)    (0xffffffffffffffffull >> (bit))
+#define kHighBitInDoubleWordMask 0x8000000000000000ull
+
+static int clzll(uint64_t x)
+{
+       if (x == 0)
+               return 64;
+       else
+               return __builtin_clzll(x);
+}
+
+#if !HFS_ALLOC_TEST
+
+static errno_t get_more_bits(bitmap_context_t *bitmap_ctx)
+{
+       uint32_t        start_bit;
+       uint32_t        iosize = 0;
+       uint32_t        byte_offset;
+       uint32_t        last_bitmap_block;
+       int                     error;
+       struct hfsmount *hfsmp = bitmap_ctx->hfsmp;
+#if !HFS_ALLOC_TEST
+       uint64_t        lock_elapsed;
+#endif
+
+
+       if (bitmap_ctx->bp)
+               ReleaseScanBitmapRange(bitmap_ctx->bp);
+       
+       if (msleep(NULL, NULL, PINOD | PCATCH,
+                          "hfs_fsinfo", NULL) == EINTR) {
+               return EINTR;
+       }
+
+#if !HFS_ALLOC_TEST
+       /*
+        * Let someone else use the allocation map after we've processed over HFS_FSINFO_MAX_LOCKHELD_TIME.
+        * lock_start is initialized in hfs_find_free_extents().
+        */
+       absolutetime_to_nanoseconds(mach_absolute_time() - bitmap_ctx->lock_start, &lock_elapsed);
+
+       if (lock_elapsed >= HFS_FSINFO_MAX_LOCKHELD_TIME) {
+
+               hfs_systemfile_unlock(hfsmp, bitmap_ctx->lockflags);
+               
+               /* add tsleep here to force context switch and fairness */
+               tsleep((caddr_t)get_more_bits, PRIBIO, "hfs_fsinfo", 1);
+
+               hfs_journal_lock(hfsmp);
+
+               /* Flush the journal and wait for all I/Os to finish up */
+               error = hfs_journal_flush(hfsmp, TRUE);
+               if (error) {
+                       hfs_journal_unlock(hfsmp);
+                       return error;
+               }
+
+               /*
+                * Take bitmap lock to ensure it is not being modified while journal is still held.
+                * Since we are reading larger than normal blocks from the bitmap, which
+                * might confuse other parts of the bitmap code using normal blocks, we
+                * take exclusive lock here.
+                */
+               bitmap_ctx->lockflags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK);
+
+               bitmap_ctx->lock_start = mach_absolute_time();
+
+               /* Release the journal lock */
+               hfs_journal_unlock(hfsmp);
+
+               /*
+                * Bitmap is read in large block size (up to 1MB),
+                * unlike the runtime which reads the bitmap in the
+                * 4K block size.  If the bitmap is read both ways
+                * at the same time, it can result in multiple buf_t with
+                * different sizes and potentially cause data corruption.
+                * To avoid this, we invalidate all the existing buffers
+                * associated with the bitmap vnode.
+                */
+               error = buf_invalidateblks(hfsmp->hfs_allocation_vp, 0, 0, 0);
+               if (error) {
+                       /* hfs_systemfile_unlock will be called in the caller */
+                       return error;
+               }
+       }
+#endif
+
+       start_bit = bitmap_ctx->run_offset;
+
+       if (start_bit >= bitmap_ctx->hfsmp->totalBlocks) {
+               bitmap_ctx->chunk_end = 0;
+               bitmap_ctx->bp = NULL;
+               bitmap_ctx->bitmap = NULL;
+               return 0;
+       }
+
+       assert(start_bit % 8 == 0);
+
+       /*
+        * Compute how much I/O we should generate here.
+        * hfs_scan_range_size will validate that the start bit
+        * converted into a byte offset into the bitmap file,
+        * is aligned on a VBMIOSize boundary.
+        */
+       error = hfs_scan_range_size (bitmap_ctx->hfsmp, start_bit, &iosize);
+       if (error)
+               return error;
+
+       /* hfs_scan_range_size should have verified startbit.  Convert it to bytes */
+       byte_offset = start_bit / kBitsPerByte;
+
+       /*
+        * When the journal replays blocks, it does so by writing directly to the disk
+        * device (bypassing any filesystem vnodes and such).  When it finishes its I/Os
+        * it also immediately re-reads and invalidates the range covered by the bp so
+        * it does not leave anything lingering in the cache (for iosize reasons).
+        *
+        * As such, it is safe to do large I/Os here with ReadBitmapRange.
+        *
+        * NOTE: It is not recommended, but it is possible to call the function below
+        * on sections of the bitmap that may be in core already as long as the pages are not
+        * dirty.  In that case, we'd notice that something starting at that
+        * logical block of the bitmap exists in the metadata cache, and we'd check
+        * if the iosize requested is the same as what was already allocated for it.
+        * Odds are pretty good we're going to request something larger.  In that case,
+        * we just free the existing memory associated with the buf and reallocate a
+        * larger range. This function should immediately invalidate it as soon as we're
+        * done scanning, so this shouldn't cause any coherency issues.
+        */
+       error = ReadBitmapRange(bitmap_ctx->hfsmp, byte_offset, iosize, (uint32_t **)&bitmap_ctx->bitmap, &bitmap_ctx->bp);
+       if (error)
+               return error;
+
+       /*
+        * At this point, we have a giant wired buffer that represents some portion of
+        * the bitmap file that we want to analyze.   We may not have gotten all 'iosize'
+        * bytes though, so clip our ending bit to what we actually read in.
+        */
+       last_bitmap_block = start_bit + buf_count(bitmap_ctx->bp) * kBitsPerByte;
+
+       /* Cap the last block to the total number of blocks if required */
+       if (last_bitmap_block > bitmap_ctx->hfsmp->totalBlocks)
+               last_bitmap_block = bitmap_ctx->hfsmp->totalBlocks;
+
+       bitmap_ctx->chunk_current = 0;  // new chunk of bitmap
+       bitmap_ctx->chunk_end = last_bitmap_block - start_bit;
+
+       return 0;
+}
+
+#endif // !HFS_ALLOC_TEST
+
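
get_more_bits() above and the b-tree walk in hfs_fsinfo.c share the same fairness scheme: remember when the lock was taken, and once more than HFS_FSINFO_MAX_LOCKHELD_TIME of absolute time has elapsed, drop the lock, yield, and re-take it. A small userspace sketch of just the timing half of that pattern; the lock, yield and re-lock steps are left as placeholder comments, and the one-second budget is an assumed value, not the kernel's constant.

    #include <stdint.h>
    #include <mach/mach_time.h>

    #define MAX_LOCKHELD_TIME_NS    (1ULL * 1000 * 1000 * 1000)     /* assumed 1 s budget */

    /* Convert a nanosecond budget into mach_absolute_time() units. */
    static uint64_t
    ns_to_abs(uint64_t ns)
    {
            mach_timebase_info_data_t tb;

            mach_timebase_info(&tb);
            return ns * tb.denom / tb.numer;
    }

    void
    scan_with_fairness(int work_items)
    {
            uint64_t budget = ns_to_abs(MAX_LOCKHELD_TIME_NS);
            uint64_t start  = mach_absolute_time();

            /* take the lock here */
            for (int i = 0; i < work_items; i++) {
                    /* ... process one chunk while holding the lock ... */
                    if (mach_absolute_time() - start >= budget) {
                            /* drop the lock, yield to waiters, re-take the lock */
                            start = mach_absolute_time();
                    }
            }
            /* release the lock here */
    }
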
+// Returns number of contiguous bits set at start
+static int bit_count_set(void *bitmap, int start, int end)
+{
+       if (start == end)
+               return 0;
+
+       assert(end > start);
+
+       const int start_bit = start & 63;
+       const int end_bit   = end & 63;
+
+       uint64_t *p = (uint64_t *)bitmap + start / 64;
+       uint64_t x = ~OSSwapBigToHostInt64(*p);
+
+       if ((start & ~63) == (end & ~63)) {
+               // Start and end in same 64 bits
+               x = (x & BIT_RIGHT_MASK(start_bit)) | BIT_RIGHT_MASK(end_bit);
+               return clzll(x) - start_bit;
+       }
+
+       // Deal with initial unaligned bit
+       x &= BIT_RIGHT_MASK(start_bit);
+
+       if (x)
+               return clzll(x) - start_bit;
+
+       // Go fast
+       ++p;
+       int count = 64 - start_bit;
+       int nquads = (end - end_bit - start - 1) / 64;
+
+       while (nquads--) {
+               if (*p != 0xffffffffffffffffull) {
+                       x = ~OSSwapBigToHostInt64(*p);
+                       return count + clzll(x);
+               }
+               ++p;
+               count += 64;
+       }
+
+       if (end_bit) {
+               x = ~OSSwapBigToHostInt64(*p) | BIT_RIGHT_MASK(end_bit);
+               count += clzll(x);
+       }
+
+       return count;
+}
+
+/* Returns the length of a run of cleared bits:
+ *  bitmap is a single chunk of memory being examined
+ *  start: the start bit relative to the current buffer to be examined; start is inclusive.
+ *  end: the end bit relative to the current buffer to be examined; end is not inclusive.
+ */
+static int bit_count_clr(void *bitmap, int start, int end)
+{
+       if (start == end)
+               return 0;
+
+       assert(end > start);
+
+       const int start_bit = start & 63;
+       const int end_bit   = end & 63;
+
+       uint64_t *p = (uint64_t *)bitmap + start / 64;
+       uint64_t x = OSSwapBigToHostInt64(*p);
+
+       if ((start & ~63) == (end & ~63)) {
+               // Start and end in same 64 bits
+               x = (x & BIT_RIGHT_MASK(start_bit)) | BIT_RIGHT_MASK(end_bit);
+
+               return clzll(x) - start_bit;
+       }
+
+       // Deal with initial unaligned bit
+       x &= BIT_RIGHT_MASK(start_bit);
+
+       if (x)
+               return clzll(x) - start_bit;
+
+       // Go fast
+       ++p;
+       int count = 64 - start_bit;
+       int nquads = (end - end_bit - start - 1) / 64;
+
+       while (nquads--) {
+               if (*p) {
+                       x = OSSwapBigToHostInt64(*p);
+                       return count + clzll(x);
+               }
+               ++p;
+               count += 64;
+       }
+
+       if (end_bit) {
+               x = OSSwapBigToHostInt64(*p) | BIT_RIGHT_MASK(end_bit);
+
+               count += clzll(x);
+       }
+
+       return count;
+}
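
Both helpers view the bitmap as big-endian 64-bit words, with bit 0 being the most significant bit of byte 0 (the on-disk HFS+ allocation bitmap layout), and use BIT_RIGHT_MASK() plus clzll() to measure the run that starts at start and is clipped at end. A small check of the expected behaviour, assuming the file is built with HFS_ALLOC_TEST so the two static helpers can be exercised from test code in the same translation unit:

    #include <assert.h>
    #include <stdint.h>

    static void
    bit_run_example(void)
    {
            /* Big-endian bitmap, bit 0 = MSB of byte 0:
             * bits 0-9 set, 10-63 clear, 64-83 set, 84-127 clear. */
            static uint8_t bitmap[16] __attribute__((aligned(8))) = {
                    0xFF, 0xC0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                    0xFF, 0xFF, 0xF0, 0x00, 0x00, 0x00, 0x00, 0x00
            };

            assert(bit_count_set(bitmap,  0, 128) == 10);   /* set run at bit 0     */
            assert(bit_count_clr(bitmap, 10, 128) == 54);   /* clear run 10..63     */
            assert(bit_count_set(bitmap, 64, 128) == 20);   /* set run 64..83       */
            assert(bit_count_clr(bitmap, 84, 128) == 44);   /* clear run to bit 127 */
    }
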
+
+#if !HFS_ALLOC_TEST
+static errno_t update_summary_table(bitmap_context_t *bitmap_ctx, uint32_t start, uint32_t count, bool set)
+{
+       uint32_t        end, start_summary_bit, end_summary_bit;
+       errno_t         error = 0;
+
+       if (count == 0)
+               goto out;
+
+       if (!ISSET(bitmap_ctx->hfsmp->hfs_flags, HFS_SUMMARY_TABLE))
+               return 0;
+
+       if (hfs_get_summary_index (bitmap_ctx->hfsmp, start, &start_summary_bit)) {
+               error = EINVAL;
+               goto out;
+       }
+
+       end = start + count - 1;
+       if (hfs_get_summary_index (bitmap_ctx->hfsmp, end, &end_summary_bit)) {
+               error = EINVAL;
+               goto out;
+       }
+
+       // if summary table bit has been updated with free block previously, leave it.
+       if ((start_summary_bit == bitmap_ctx->last_free_summary_bit) && set)
+               start_summary_bit++;
+
+       for (uint32_t summary_bit = start_summary_bit; summary_bit <= end_summary_bit; summary_bit++)
+               hfs_set_summary (bitmap_ctx->hfsmp, summary_bit, set);
+
+       if (!set)
+               bitmap_ctx->last_free_summary_bit = end_summary_bit;
+
+out:
+       return error;
+
+}
+#endif //!HFS_ALLOC_TEST
+
+/*
+ * Read the bitmap into memory in chunks, and find a run of cleared/set bits;
+ * the run can extend across chunk boundaries.
+ * bit_count_clr can be passed to get a run of cleared bits.
+ * bit_count_set can be passed to get a run of set bits.
+ */
+static errno_t hfs_bit_count(bitmap_context_t *bitmap_ctx, int (*fn)(void *, int ,int), uint32_t *bit_count)
+{
+       int count;
+       errno_t error = 0;
+
+       *bit_count = 0;
+
+       do {
+               if (bitmap_ctx->run_offset == 0 || bitmap_ctx->chunk_current == bitmap_ctx->chunk_end) {
+                       if ((error = get_more_bits(bitmap_ctx)) != 0)
+                               goto out;
+               }
+
+               if (bitmap_ctx->chunk_end == 0)
+                       break;
+
+               count = fn(bitmap_ctx->bitmap, bitmap_ctx->chunk_current, bitmap_ctx->chunk_end);
+
+               bitmap_ctx->run_offset += count;
+               bitmap_ctx->chunk_current += count;
+               *bit_count += count;
+
+       } while (bitmap_ctx->chunk_current >= bitmap_ctx->chunk_end && count);
+
+out:
+       return error;
+
+}
+
+// Returns count of number of bits clear
+static errno_t hfs_bit_count_clr(bitmap_context_t *bitmap_ctx, uint32_t *count)
+{
+       return hfs_bit_count(bitmap_ctx, bit_count_clr, count);
+}
+
+// Returns count of number of bits set
+static errno_t hfs_bit_count_set(bitmap_context_t *bitmap_ctx, uint32_t *count)
+{
+       return hfs_bit_count(bitmap_ctx, bit_count_set, count);
+}
+
+static uint32_t hfs_bit_offset(bitmap_context_t *bitmap_ctx)
+{
+       return bitmap_ctx->run_offset;
+}
+
+/*
+ * Perform a full scan of the bitmap file.
+ * Note: during the scan of the bitmap file, it may drop and reacquire the
+ * bitmap lock to let someone else use the bitmap for fairness.
+ * Currently it is used by HFS_GET_FSINFO statistics gathering, which
+ * is run while other processes might perform HFS operations.
+ */
+
+errno_t hfs_find_free_extents(struct hfsmount *hfsmp,
+                                                         void (*callback)(void *data, off_t free_extent_size), void *callback_arg)
+{
+       struct bitmap_context bitmap_ctx;
+       uint32_t count;
+       errno_t error = 0;
+
+       if ((hfsmp->hfs_flags & HFS_SUMMARY_TABLE) == 0) {
+               error = hfs_init_summary(hfsmp);
+               if (error)
+                       return error;
+       }
+
+       bzero(&bitmap_ctx, sizeof(struct bitmap_context));
+
+       /*
+        * The journal maintains a list of recently deallocated blocks to
+        * issue DKIOCUNMAPs when the corresponding journal transaction is
+        * flushed to the disk.  To avoid any race conditions, we only
+        * want one active trim list.  Therefore we make sure that the
+        * journal trim list is sync'ed, empty, and not modifiable for
+        * the duration of our scan.
+        *
+        * Take the journal lock before flushing the journal to the disk.
+        * We will keep holding the journal lock until we get the
+        * bitmap lock, to make sure that no new journal transactions can
+        * start.  This will make sure that the journal trim list is not
+        * modified after the journal flush and before getting bitmap lock.
+        * We can release the journal lock after we acquire the bitmap
+        * lock as it will prevent any further block deallocations.
+        */
+       hfs_journal_lock(hfsmp);
+
+       /* Flush the journal and wait for all I/Os to finish up */
+       error = hfs_journal_flush(hfsmp, TRUE);
+       if (error) {
+               hfs_journal_unlock(hfsmp);
+               return error;
+       }
+
+       /*
+        * Take bitmap lock to ensure it is not being modified.
+        * Since we are reading larger than normal blocks from the bitmap, which
+        * might confuse other parts of the bitmap code using normal blocks, we
+        * take exclusive lock here.
+        */
+       bitmap_ctx.lockflags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK);
+
+#if !HFS_ALLOC_TEST
+       bitmap_ctx.lock_start = mach_absolute_time();
+#endif
+
+       /* Release the journal lock */
+       hfs_journal_unlock(hfsmp);
+
+       /*
+        * Bitmap is read in large block size (up to 1MB),
+        * unlike the runtime which reads the bitmap in the
+        * 4K block size.  If the bitmap is read both ways
+        * at the same time, it can result in multiple buf_t with
+        * different sizes and potentially cause data corruption.
+        * To avoid this, we invalidate all the existing buffers
+        * associated with the bitmap vnode.
+        */
+       error = buf_invalidateblks(hfsmp->hfs_allocation_vp, 0, 0, 0);
+       if (error)
+               goto out;
+
+       /*
+        * Get the list of all free extent ranges.  For each run of clear
+        * bits found below, the caller-provided callback is invoked to
+        * account for that free extent range.
+        */
+       bitmap_ctx.hfsmp = hfsmp;
+       bitmap_ctx.run_offset = 0;
+
+       while (bitmap_ctx.run_offset < hfsmp->totalBlocks) {
+
+               uint32_t start = hfs_bit_offset(&bitmap_ctx);
+
+               if ((error = hfs_bit_count_clr(&bitmap_ctx, &count)) != 0)
+                       goto out;
+
+               if (count)
+                       callback(callback_arg, hfs_blk_to_bytes(count, hfsmp->blockSize));
+
+               if ((error = update_summary_table(&bitmap_ctx, start, count, false)) != 0)
+                       goto out;
+
+               start = hfs_bit_offset(&bitmap_ctx);
+
+               if ((error = hfs_bit_count_set(&bitmap_ctx, &count)) != 0)
+                       goto out;
+
+               if ((error = update_summary_table(&bitmap_ctx, start, count, true)) != 0)
+                       goto out;
+       }
+
+out:
+       if (bitmap_ctx.lockflags) {
+               hfs_systemfile_unlock(hfsmp, bitmap_ctx.lockflags);
+       }
+
+       return error;
+}
+
index 18e64caf43db9cdd382a360b38e6bb94831ec674..30eb8a84eafd7952b2f0aca8638654a4d5c1d6d9 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2013 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
 #include <sys/param.h>
 #include <sys/vnode.h>
 
+#if !HFS_ALLOC_TEST
+
 #include "../../hfs.h"
 #include "../../hfs_macos_defs.h"
 #include "../../hfs_format.h"
 #include "../../hfs_cnode.h"
 
+#endif
 
 #ifdef __cplusplus
 extern "C" {
@@ -255,6 +258,9 @@ ScanUnmapBlocks(struct hfsmount *hfsmp);
 EXTERN_API_C( int )
 hfs_init_summary (struct hfsmount *hfsmp);
 
+errno_t hfs_find_free_extents(struct hfsmount *hfsmp,
+                                                         void (*callback)(void *data, off_t), void *callback_arg);
+
 /*     File Extent Mapping routines*/
 EXTERN_API_C( OSErr )
 FlushExtentFile                                        (ExtendedVCB *                  vcb);
index 105922628243f3dfbbf1b2768570f21eeef29825..b344c7a9de873f9bc3c50fb7ce813e5976df4ceb 100644 (file)
@@ -287,6 +287,7 @@ void bsd_utaskbootstrap(void);
 
 static void parse_bsd_args(void);
 extern task_t bsd_init_task;
+extern boolean_t init_task_died;
 extern char    init_task_failure_data[];
 #if CONFIG_DEV_KMEM
 extern void dev_kmem_init(void);
@@ -1013,6 +1014,7 @@ bsdinit_task(void)
        ut = (uthread_t)get_bsdthread_info(thread);
 
        bsd_init_task = get_threadtask(thread);
+       init_task_died = FALSE;
        init_task_failure_data[0] = 0;
 
 #if CONFIG_MACF
index 1b89e2d4aa3ad573c6e23a0436161446c60c355f..65c98080d0554a3c64b094ddd464793dee2ccc07 100644 (file)
 
 #include <machine/pal_routines.h>
 
+extern boolean_t kdebug_serial;
+#if KDEBUG_MOJO_TRACE
+#include <sys/kdebugevents.h>
+static void kdebug_serial_print(       /* forward */
+               uint32_t, uint32_t, uint64_t,
+               uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t);
+#endif
+
 /*
  * IOP(s)
  *
@@ -302,7 +310,6 @@ pid_t global_state_pid = -1;       /* Used to control exclusive use of kd_buffer
 #define MACH_SysCall   0x010c0000
 #define DBG_SCALL_MASK 0xffff0000
 
-
 /* task to string structure */
 struct tts
 {
@@ -392,7 +399,6 @@ kdbg_set_tracing_enabled(boolean_t enabled, uint32_t trace_type)
 {
        int s = ml_set_interrupts_enabled(FALSE);
        lck_spin_lock(kds_spin_lock);
-
        if (enabled) {
                kdebug_enable |= trace_type;
                kd_ctrl_page.kdebug_slowcheck &= ~SLOW_NOLOG;
@@ -881,6 +887,12 @@ record_event:
        kdbp = &kdbip[coreid];
        timestamp &= KDBG_TIMESTAMP_MASK;
 
+#if KDEBUG_MOJO_TRACE
+       if (kdebug_enable & KDEBUG_ENABLE_SERIAL)
+               kdebug_serial_print(coreid, debugid, timestamp,
+                                   arg1, arg2, arg3, arg4, threadid);
+#endif
+
 retry_q:
        kds_raw = kdbp->kd_list_tail;
 
@@ -1057,6 +1069,14 @@ record_event:
 
        cpu = cpu_number();
        kdbp = &kdbip[cpu];
+
+#if KDEBUG_MOJO_TRACE
+       if (kdebug_enable & KDEBUG_ENABLE_SERIAL)
+               kdebug_serial_print(cpu, debugid,
+                                   mach_absolute_time() & KDBG_TIMESTAMP_MASK,
+                                   arg1, arg2, arg3, arg4, arg5);
+#endif
+
 retry_q:
        kds_raw = kdbp->kd_list_tail;
 
@@ -1168,7 +1188,7 @@ kernel_debug_string(const char *message)
        /* Stuff the message string in the args and log it. */
         strncpy((char *)arg, message, MIN(sizeof(arg), strlen(message)));
        KERNEL_DEBUG_EARLY(
-               (TRACEDBG_CODE(DBG_TRACE_INFO, 4)) | DBG_FUNC_NONE,
+               TRACE_INFO_STRING,
                arg[0], arg[1], arg[2], arg[3]);
 }
 
@@ -1186,8 +1206,10 @@ kernel_debug_early(
        uintptr_t       arg4)
 {
        /* If tracing is already initialized, use it */
-       if (nkdbufs)
+       if (nkdbufs) {
                KERNEL_DEBUG_CONSTANT(debugid, arg1, arg2, arg3, arg4, 0);
+               return;
+       }
 
        /* Do nothing if the buffer is full or we're not on the boot cpu */ 
        kd_early_overflow = kd_early_index >= KD_EARLY_BUFFER_MAX;
@@ -1206,7 +1228,7 @@ kernel_debug_early(
 }
 
 /*
- * Transfer the contents of the temporary buffer into the trace buffers.
+ * Transfer the contents of the temporary buffer into the trace buffers.
  * Precede that by logging the rebase time (offset) - the TSC-based time (in ns)
  * when mach_absolute_time is set to 0.
  */
@@ -1221,7 +1243,7 @@ kernel_debug_early_end(void)
        /* Fake sentinel marking the start of kernel time relative to TSC */
        kernel_debug_enter(
                0,
-               (TRACEDBG_CODE(DBG_TRACE_INFO, 1)) | DBG_FUNC_NONE,
+               TRACE_TIMESTAMPS,
                0,
                (uint32_t)(tsc_rebase_abs_time >> 32),
                (uint32_t)tsc_rebase_abs_time,
@@ -1243,7 +1265,7 @@ kernel_debug_early_end(void)
        /* Cut events-lost event on overflow */
        if (kd_early_overflow)
                KERNEL_DEBUG_CONSTANT(
-                       TRACEDBG_CODE(DBG_TRACE_INFO, 2), 0, 0, 0, 0, 0);
+                       TRACE_LOST_EVENTS, 0, 0, 0, 0, 0);
 
        /* This trace marks the start of kernel tracing */
        kernel_debug_string("early trace done");
@@ -2453,9 +2475,9 @@ kdbg_control(int *name, u_int namelen, user_addr_t where, size_t *sizep)
                                if (name[0] == KERN_KDWRITETR) {
                                        number = nkdbufs * sizeof(kd_buf);
 
-                                       KERNEL_DEBUG_CONSTANT((TRACEDBG_CODE(DBG_TRACE_INFO, 3)) | DBG_FUNC_START, 0, 0, 0, 0, 0);
+                                       KERNEL_DEBUG_CONSTANT(TRACE_WRITING_EVENTS | DBG_FUNC_START, 0, 0, 0, 0, 0);
                                        ret = kdbg_read(0, &number, vp, &context);
-                                       KERNEL_DEBUG_CONSTANT((TRACEDBG_CODE(DBG_TRACE_INFO, 3)) | DBG_FUNC_END, number, 0, 0, 0, 0);
+                                       KERNEL_DEBUG_CONSTANT(TRACE_WRITING_EVENTS | DBG_FUNC_END, number, 0, 0, 0, 0);
 
                                        *sizep = number;
                                } else {
@@ -2635,7 +2657,7 @@ kdbg_read(user_addr_t buffer, size_t *number, vnode_t vp, vfs_context_t ctx)
                return EINVAL;
 
        memset(&lostevent, 0, sizeof(lostevent));
-       lostevent.debugid = TRACEDBG_CODE(DBG_TRACE_INFO, 2);
+       lostevent.debugid = TRACE_LOST_EVENTS;
 
        /* Capture timestamp. Only sort events that have occurred before the timestamp.
         * Since the iop is being flushed here, it's possible that events occur on the AP
@@ -3107,7 +3129,11 @@ start_kern_tracing(unsigned int new_nkdbufs, boolean_t need_map)
                /* Hold off interrupts until the early traces are cut */
                boolean_t       s = ml_set_interrupts_enabled(FALSE);
 
-               kdbg_set_tracing_enabled(TRUE, KDEBUG_ENABLE_TRACE);
+               kdbg_set_tracing_enabled(
+                       TRUE,
+                       kdebug_serial ?
+                               (KDEBUG_ENABLE_TRACE | KDEBUG_ENABLE_SERIAL) :
+                                KDEBUG_ENABLE_TRACE);
 
                /*
                 * Transfer all very early events from the static buffer
@@ -3118,8 +3144,14 @@ start_kern_tracing(unsigned int new_nkdbufs, boolean_t need_map)
                ml_set_interrupts_enabled(s);
 
                printf("kernel tracing started\n");
+#if KDEBUG_MOJO_TRACE
+               if (kdebug_serial) {
+                       printf("serial output enabled with %lu named events\n",
+                       sizeof(kd_events)/sizeof(kd_event_t));
+               }
+#endif
        } else {
-               printf("error from kdbg_reinit,kernel tracing not started\n");
+               printf("error from kdbg_reinit, kernel tracing not started\n");
        }
 }
 
@@ -3167,7 +3199,7 @@ kdbg_dump_trace_to_file(const char *filename)
                        return;
                }
        }
-       KERNEL_DEBUG_CONSTANT((TRACEDBG_CODE(DBG_TRACE_INFO, 0)) | DBG_FUNC_NONE, 0, 0, 0, 0, 0);
+       KERNEL_DEBUG_CONSTANT(TRACE_PANIC | DBG_FUNC_NONE, 0, 0, 0, 0, 0);
 
        kdebug_enable = 0;
        kd_ctrl_page.enabled = 0;
@@ -3209,3 +3241,146 @@ void kdbg_get_task_name(char* name_buf, int len, task_t task)
        else
                snprintf(name_buf, len, "%p [!bsd]", task);
 }
+
+#if KDEBUG_MOJO_TRACE
+static kd_event_t *
+binary_search(uint32_t id)
+{
+       int low, high, mid;
+
+       low = 0;
+       high = sizeof(kd_events)/sizeof(kd_event_t) - 1;
+
+       while (TRUE)
+       {
+               mid = (low + high) / 2;
+
+               if (low > high)
+                       return NULL; /* failed */
+               else if ( low + 1 >= high) {
+                       /* We have a match */
+                       if (kd_events[high].id == id)
+                               return &kd_events[high];
+                       else if (kd_events[low].id == id)
+                               return &kd_events[low];
+                       else
+                               return NULL;  /* search failed */
+               }
+               else if (id < kd_events[mid].id)
+                       high = mid;
+               else
+                       low = mid;
+       } 
+}
+
+/*
+ * Look up event id to get name string.
+ * Using a per-cpu cache of a single entry
+ * before resorting to a binary search of the full table.
+ */
+#define        NCACHE  1
+static kd_event_t      *last_hit[MAX_CPUS];
+static kd_event_t *
+event_lookup_cache(uint32_t cpu, uint32_t id)
+{
+       if (last_hit[cpu] == NULL || last_hit[cpu]->id != id)
+               last_hit[cpu] = binary_search(id);
+       return last_hit[cpu];
+}
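
The cache is a single remembered entry per CPU, so bursts of the same event id skip the search entirely; binary_search() in turn relies on kd_events (provided by the generated sys/kdebugevents.h) being sorted by ascending id. A stand-alone illustration of the same lookup contract, written as a conventional binary search over made-up ids:

    #include <stdint.h>
    #include <stdio.h>

    typedef struct {
            uint32_t        id;
            const char      *name;
    } ev_t;

    /* Must be sorted by ascending id, like kd_events.  Ids here are illustrative. */
    static const ev_t table[] = {
            { 0x01020000, "MACH_example_a" },
            { 0x010c0000, "MACH_SysCall"   },
            { 0x03010000, "VFS_example"    },
    };

    static const ev_t *
    lookup(uint32_t id)
    {
            int low = 0, high = (int)(sizeof(table) / sizeof(table[0])) - 1;

            while (low <= high) {
                    int mid = (low + high) / 2;

                    if (table[mid].id == id)
                            return &table[mid];
                    if (id < table[mid].id)
                            high = mid - 1;
                    else
                            low = mid + 1;
            }
            return NULL;    /* unknown id: kdebug_serial_print() falls back to the raw hex */
    }

    int main(void)
    {
            const ev_t *e = lookup(0x010c0000);

            printf("%s\n", e ? e->name : "unknown");
            return 0;
    }
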
+
+static uint64_t        kd_last_timstamp;
+
+static void
+kdebug_serial_print(
+       uint32_t        cpunum,
+       uint32_t        debugid,
+       uint64_t        timestamp,
+       uintptr_t       arg1,
+       uintptr_t       arg2,
+       uintptr_t       arg3,
+       uintptr_t       arg4,
+       uintptr_t       threadid
+       )
+{
+       char            kprintf_line[192];
+       char            event[40];
+       uint64_t        us = timestamp / NSEC_PER_USEC;
+       uint64_t        us_tenth = (timestamp % NSEC_PER_USEC) / 100;
+       uint64_t        delta = timestamp - kd_last_timstamp;
+       uint64_t        delta_us = delta / NSEC_PER_USEC;
+       uint64_t        delta_us_tenth = (delta % NSEC_PER_USEC) / 100;
+       uint32_t        event_id = debugid & DBG_FUNC_MASK;
+       const char      *command;
+       const char      *bra;
+       const char      *ket;
+       kd_event_t      *ep;
+
+       /* event time and delta from last */
+       snprintf(kprintf_line, sizeof(kprintf_line),
+               "%11llu.%1llu %8llu.%1llu ",
+               us, us_tenth, delta_us, delta_us_tenth);
+
+
+       /* event (id or name) - start prefixed by "[", end postfixed by "]" */
+       bra = (debugid & DBG_FUNC_START) ? "[" : " ";
+       ket = (debugid & DBG_FUNC_END)   ? "]" : " ";
+       ep = event_lookup_cache(cpunum, event_id);
+       if (ep) {
+               if (strlen(ep->name) < sizeof(event) - 3)
+                       snprintf(event, sizeof(event), "%s%s%s",
+                                bra, ep->name, ket);
+               else
+                       snprintf(event, sizeof(event), "%s%x(name too long)%s",
+                                bra, event_id, ket);
+       } else {
+               snprintf(event, sizeof(event), "%s%x%s",
+                        bra, event_id, ket);
+       }
+       snprintf(kprintf_line + strlen(kprintf_line),
+                sizeof(kprintf_line) - strlen(kprintf_line),
+                "%-40s  ", event);
+
+       /* arg1 .. arg4 with special cases for strings */
+       switch (event_id) {
+           case VFS_LOOKUP:
+           case VFS_LOOKUP_DONE:
+               if (debugid & DBG_FUNC_START) {
+                       /* arg1 hex then arg2..arg4 chars */
+                       snprintf(kprintf_line + strlen(kprintf_line),
+                               sizeof(kprintf_line) - strlen(kprintf_line),
+                               "%-16lx %-8s%-8s%-8s                          ",
+                               arg1, (char*)&arg2, (char*)&arg3, (char*)&arg4);
+                       break;
+               }
+               /* else fall through for arg1..arg4 chars */
+           case TRACE_STRING_EXEC:
+           case TRACE_STRING_NEWTHREAD:
+           case TRACE_INFO_STRING:
+               snprintf(kprintf_line + strlen(kprintf_line),
+                       sizeof(kprintf_line) - strlen(kprintf_line),
+                       "%-8s%-8s%-8s%-8s                                   ",
+                       (char*)&arg1, (char*)&arg2, (char*)&arg3, (char*)&arg4);
+               break;
+           default:
+               snprintf(kprintf_line + strlen(kprintf_line),
+                       sizeof(kprintf_line) - strlen(kprintf_line),
+                       "%-16lx %-16lx %-16lx %-16lx",
+                       arg1, arg2, arg3, arg4);
+       }
+
+       /* threadid, cpu and command name */
+       if (threadid == (uintptr_t)thread_tid(current_thread()) &&
+           current_proc() &&
+           current_proc()->p_comm)
+               command = current_proc()->p_comm;
+       else
+               command = "-";
+       snprintf(kprintf_line + strlen(kprintf_line),
+               sizeof(kprintf_line) - strlen(kprintf_line),
+               "  %-16lx  %-2d %s\n",
+               threadid, cpunum, command);
+       
+       kprintf("%s", kprintf_line);
+       kd_last_timstamp = timestamp;
+}
+#endif
index 16a66ae8293ceceee4c03f017e838c6f59b41a5e..8e5b0150bd8775b1dd5aac552cbc792e2df70012 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1999-2014 Apple Inc. All rights reserved.
+ * Copyright (c) 1999-2015 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -317,6 +317,8 @@ ctl_connect(struct socket *so, struct sockaddr *nam, struct proc *p)
        struct sockaddr_ctl     sa;
        struct ctl_cb           *kcb = (struct ctl_cb *)so->so_pcb;
        struct ctl_cb           *kcb_next = NULL;
+       u_quad_t                sbmaxsize;
+       u_int32_t               recvbufsize, sendbufsize;
 
        if (kcb == 0)
                panic("ctl_connect so_pcb null\n");
@@ -391,11 +393,27 @@ ctl_connect(struct socket *so, struct sockaddr *nam, struct proc *p)
        kctlstat.kcs_connections++;
        lck_mtx_unlock(ctl_mtx);
 
-       error = soreserve(so, kctl->sendbufsize, kctl->recvbufsize);
+       /*
+        * rdar://15526688: Limit the send and receive sizes to sb_max
+        * by using the same scaling as sbreserve()
+        */
+       sbmaxsize = (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES);
+
+       if (kctl->sendbufsize > sbmaxsize)
+               sendbufsize = sbmaxsize;
+       else
+               sendbufsize = kctl->sendbufsize;
+
+       if (kctl->recvbufsize > sbmaxsize)
+               recvbufsize = sbmaxsize;
+       else
+               recvbufsize = kctl->recvbufsize;
+
+       error = soreserve(so, sendbufsize, recvbufsize);
        if (error) {
                printf("%s - soreserve(%llx, %u, %u) error %d\n", __func__,
                        (uint64_t)VM_KERNEL_ADDRPERM(so),
-                       kctl->sendbufsize, kctl->recvbufsize, error);
+                       sendbufsize, recvbufsize, error);
                goto done;
        }
        soisconnecting(so);
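
The clamp mirrors the scaling used by sbreserve(), which charges per-mbuf bookkeeping (MSIZE) on top of each data cluster (MCLBYTES), so the largest request that still fits under sb_max is sb_max * MCLBYTES / (MSIZE + MCLBYTES). With the usual xnu values of MSIZE 256 and MCLBYTES 2048 that works out to 8/9 of sb_max; a quick stand-alone check (the 8 MB sb_max here is just an example value):

    #include <stdio.h>

    int main(void)
    {
            unsigned long long sb_max    = 8ULL * 1024 * 1024;      /* example value    */
            unsigned long long msize     = 256;                     /* typical MSIZE    */
            unsigned long long mclbytes  = 2048;                    /* typical MCLBYTES */
            unsigned long long sbmaxsize = sb_max * mclbytes / (msize + mclbytes);

            /* prints: sbmaxsize = 7456540 (0.889 of sb_max) */
            printf("sbmaxsize = %llu (%.3f of sb_max)\n",
                sbmaxsize, (double)sbmaxsize / (double)sb_max);
            return 0;
    }
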
@@ -631,7 +649,7 @@ ctl_rcvbspace(struct kctl *kctl, struct socket *so, u_int32_t datasize,
        struct sockbuf *sb = &so->so_rcv;
        u_int32_t space = sbspace(sb);
        errno_t error;
-       
+
        if ((kctl->flags & CTL_FLAG_REG_CRIT) == 0) {
                if ((u_int32_t) space >= datasize)
                        error = 0;
@@ -1116,10 +1134,9 @@ ctl_register(struct kern_ctl_reg *userkctl, kern_ctl_ref *kctlref)
 {
        struct kctl     *kctl = NULL;
        struct kctl     *kctl_next = NULL;
-       u_int32_t               id = 1;
-       size_t                  name_len;
-       int                             is_extended = 0;
-       u_quad_t        sbmaxsize;
+       u_int32_t       id = 1;
+       size_t          name_len;
+       int             is_extended = 0;
 
        if (userkctl == NULL)   /* sanity check */
                return (EINVAL);
@@ -1210,27 +1227,19 @@ ctl_register(struct kern_ctl_reg *userkctl, kern_ctl_ref *kctlref)
 
        /*
         * Let the caller know the default send and receive sizes
-        *
-        * rdar://15526688: Limit the send and receive sizes to sb_max
-        * by using the same scaling as sbreserve()
         */
-       sbmaxsize = (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES);
-
-       if (userkctl->ctl_sendsize == 0)
+       if (userkctl->ctl_sendsize == 0) {
                kctl->sendbufsize = CTL_SENDSIZE;
-       else if (userkctl->ctl_sendsize > sbmaxsize)
-               kctl->sendbufsize = sbmaxsize;
-       else
-       kctl->sendbufsize = userkctl->ctl_sendsize;
-       userkctl->ctl_sendsize = kctl->sendbufsize;
-
-       if (userkctl->ctl_recvsize == 0)
+               userkctl->ctl_sendsize = kctl->sendbufsize;
+       } else {
+               kctl->sendbufsize = userkctl->ctl_sendsize;
+       }
+       if (userkctl->ctl_recvsize == 0) {
                kctl->recvbufsize = CTL_RECVSIZE;
-       else if (userkctl->ctl_recvsize > sbmaxsize)
-               kctl->recvbufsize = sbmaxsize;
-       else
-       kctl->recvbufsize = userkctl->ctl_recvsize;
-       userkctl->ctl_recvsize = kctl->recvbufsize;
+               userkctl->ctl_recvsize = kctl->recvbufsize;
+       } else {
+               kctl->recvbufsize = userkctl->ctl_recvsize;
+       }
 
        kctl->connect = userkctl->ctl_connect;
        kctl->disconnect = userkctl->ctl_disconnect;
index 3b16c4ca0e38d9bc788686bc006243ce529a3ba4..708aef4747ad4ddd1bbeef79d7c7fdf43cedaa9f 100644 (file)
@@ -87,6 +87,7 @@
 #include <kern/clock.h>
 #include <kern/thread_call.h>
 #include <kern/sched_prim.h>
+#include <kern/wait_queue.h>
 #include <kern/zalloc.h>
 #include <kern/assert.h>
 
@@ -415,6 +416,7 @@ kqlock2knotedrop(struct kqueue *kq, struct knote *kn)
        int oktodrop;
 
        oktodrop = ((kn->kn_status & (KN_DROPPING | KN_ATTACHING)) == 0);
+       kn->kn_status &= ~KN_STAYQUEUED;
        kn->kn_status |= KN_DROPPING;
        if (oktodrop) {
                if (kn->kn_inuse == 0) {
@@ -1180,6 +1182,7 @@ kqueue_alloc(struct proc *p)
                        kq->kq_p = p;
                } else {
                        FREE_ZONE(kq, sizeof (struct kqueue), M_KQUEUE);
+                       kq = NULL;
                }
        }
 
@@ -2624,10 +2627,7 @@ knote_unlink_wait_queue(struct knote *kn, struct wait_queue *wq, wait_queue_link
        kern_return_t kr;
 
        kr = wait_queue_unlink_nofree(wq, kq->kq_wqs, wqlp);
-       kqlock(kq);
-       kn->kn_status &= ~KN_STAYQUEUED;
-       knote_dequeue(kn);
-       kqunlock(kq);
+       knote_clearstayqueued(kn);
        return ((kr != KERN_SUCCESS) ? EINVAL : 0);
 }
 
@@ -3517,3 +3517,12 @@ knote_markstayqueued(struct knote *kn)
        knote_enqueue(kn);
        kqunlock(kn->kn_kq);
 }
+
+void
+knote_clearstayqueued(struct knote *kn)
+{
+       kqlock(kn->kn_kq);
+       kn->kn_status &= ~KN_STAYQUEUED;
+       knote_dequeue(kn);
+       kqunlock(kn->kn_kq);
+}
index 3d2710538031d4d0903c33131eb019954f62193e..4816a4891c3ac3038a6ed24174bd6abf3039f343 100644 (file)
@@ -642,6 +642,13 @@ exec_fat_imgact(struct image_params *imgp)
                int nfat_arch = 0, pr = 0, f = 0;
 
                nfat_arch = OSSwapBigToHostInt32(fat_header->nfat_arch);
+
+               /* make sure bogus nfat_arch doesn't cause chaos - 19376072 */
+               if ( (sizeof(struct fat_header) + (nfat_arch * sizeof(struct fat_arch))) > PAGE_SIZE ) {
+                       error = EBADEXEC;
+                       goto bad;
+               }
+
                /* Check each preference listed against all arches in header */
                for (pr = 0; pr < NBINPREFS; pr++) {
                        cpu_type_t pref = psa->psa_binprefs[pr];
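
The new bound keeps the fat-arch walk inside the page of header data that exec reads from the file. With a 4 KB page and the usual Mach-O sizes (8 bytes for struct fat_header, 20 bytes for struct fat_arch), the check allows at most (4096 - 8) / 20 = 204 architecture entries and rejects anything larger with EBADEXEC before the per-architecture loop runs. A quick stand-alone check of that bound:

    #include <stdio.h>
    #include <stddef.h>

    int main(void)
    {
            size_t page_size       = 4096;
            size_t fat_header_size = 8;     /* magic + nfat_arch, 2 x uint32_t          */
            size_t fat_arch_size   = 20;    /* cputype, cpusubtype, offset, size, align */
            size_t max_nfat_arch   = (page_size - fat_header_size) / fat_arch_size;

            printf("largest accepted nfat_arch: %zu\n", max_nfat_arch);     /* 204 */
            return 0;
    }
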
@@ -1114,14 +1121,14 @@ grade:
                kdbg_trace_string(p, &dbg_arg1, &dbg_arg2, &dbg_arg3, &dbg_arg4);
 
                if (vfexec || spawn) {
-                       KERNEL_DEBUG_CONSTANT1((TRACEDBG_CODE(DBG_TRACE_DATA, 2)) | DBG_FUNC_NONE,
+                       KERNEL_DEBUG_CONSTANT1(TRACE_DATA_EXEC | DBG_FUNC_NONE,
                                        p->p_pid ,0,0,0, (uintptr_t)thread_tid(thread));
-                       KERNEL_DEBUG_CONSTANT1((TRACEDBG_CODE(DBG_TRACE_STRING, 2)) | DBG_FUNC_NONE,
+                       KERNEL_DEBUG_CONSTANT1(TRACE_STRING_EXEC | DBG_FUNC_NONE,
                                        dbg_arg1, dbg_arg2, dbg_arg3, dbg_arg4, (uintptr_t)thread_tid(thread));
                } else {
-                       KERNEL_DEBUG_CONSTANT((TRACEDBG_CODE(DBG_TRACE_DATA, 2)) | DBG_FUNC_NONE,
+                       KERNEL_DEBUG_CONSTANT(TRACE_DATA_EXEC | DBG_FUNC_NONE,
                                        p->p_pid ,0,0,0,0);
-                       KERNEL_DEBUG_CONSTANT((TRACEDBG_CODE(DBG_TRACE_STRING, 2)) | DBG_FUNC_NONE,
+                       KERNEL_DEBUG_CONSTANT(TRACE_STRING_EXEC | DBG_FUNC_NONE,
                                        dbg_arg1, dbg_arg2, dbg_arg3, dbg_arg4, 0);
                }
        }
@@ -2429,7 +2436,7 @@ bad:
                /* notify only if it has not failed due to FP Key error */
                if ((p->p_lflag & P_LTERM_DECRYPTFAIL) == 0)
                        proc_knote(p, NOTE_EXEC);
-       } else {
+       } else if (error == 0) {
                /* reset the importance attribute from our previous life */
                task_importance_reset(p->task);
 
index c1bff3128f991905dddac61220a65dc40884129a..3d17f687c108126913aff2c2d25c116e0d81a69e 100644 (file)
@@ -149,6 +149,7 @@ extern void dtrace_lazy_dofs_destroy(proc_t);
 
 #include <sys/sdt.h>
 
+extern boolean_t init_task_died;
 extern char init_task_failure_data[];
 void proc_prepareexit(proc_t p, int rv, boolean_t perf_notify);
 void vfork_exit(proc_t p, int rv);
@@ -354,6 +355,7 @@ exit1_internal(proc_t p, int rv, int *retval, boolean_t thread_can_terminate, bo
                        sync(p, (void *)NULL, (int *)NULL);
                }
 #endif
+               init_task_died = TRUE;
                panic("%s died\nState at Last Exception:\n\n%s", 
                                                        (p->p_comm[0] != '\0' ?
                                                                p->p_comm :
index 1301dbeea3ce53914c8d011b735e3efa51ab54ab..23c602e8bb05d6cafe2590240b9592d914b6122b 100644 (file)
@@ -720,42 +720,34 @@ setuid(proc_t p, struct setuid_args *uap, __unused int32_t *retval)
        kauth_cred_t my_cred, my_new_cred;
        posix_cred_t my_pcred;
 
-
        uid = uap->uid;
 
+       /* get current credential and take a reference while we muck with it */
        my_cred = kauth_cred_proc_ref(p);
        my_pcred = posix_cred_get(my_cred);
 
        DEBUG_CRED_ENTER("setuid (%d/%d): %p %d\n", p->p_pid, (p->p_pptr ? p->p_pptr->p_pid : 0), my_cred, uap->uid);
        AUDIT_ARG(uid, uid);
 
-       if (uid != my_pcred->cr_ruid &&         /* allow setuid(getuid()) */
-           uid != my_pcred->cr_svuid &&        /* allow setuid(saved uid) */
-           (error = suser(my_cred, &p->p_acflag))) {
-               kauth_cred_unref(&my_cred);
-               return (error);
-       }
-       /*
-        * Everything's okay, do it.
-        */
+       for (;;) {
+               if (uid != my_pcred->cr_ruid &&         /* allow setuid(getuid()) */
+                   uid != my_pcred->cr_svuid &&        /* allow setuid(saved uid) */
+                   (error = suser(my_cred, &p->p_acflag))) {
+                       kauth_cred_unref(&my_cred);
+                       return (error);
+               }
 
-       /*
-        * If we are priviledged, then set the saved and real UID too;
-        * otherwise, just set the effective UID
-        */
-       if (suser(my_cred, &p->p_acflag) == 0) {
-               svuid = uid;
-               ruid = uid;
                /*
-                * Transfer proc count to new user.
-                * chgproccnt uses list lock for protection
+                * If we are privileged, then set the saved and real UID too;
+                * otherwise, just set the effective UID
                 */
-               (void)chgproccnt(uid, 1);
-               (void)chgproccnt(my_pcred->cr_ruid, -1);
-       }
-
-       /* get current credential and take a reference while we muck with it */
-       for (;;) {
+               if (suser(my_cred, &p->p_acflag) == 0) {
+                       svuid = uid;
+                       ruid = uid;
+               } else {
+                       svuid = KAUTH_UID_NONE;
+                       ruid = KAUTH_UID_NONE;
+               }
                /*
                 * Only set the gmuid if the current cred has not opt'ed out;
                 * this normally only happens when calling setgroups() instead
@@ -780,17 +772,39 @@ setuid(proc_t p, struct setuid_args *uap, __unused int32_t *retval)
 
                        DEBUG_CRED_CHANGE("setuid CH(%d): %p/0x%08x -> %p/0x%08x\n", p->p_pid, my_cred, my_pcred->cr_flags, my_new_cred, posix_cred_get(my_new_cred)->cr_flags);
 
+                       /*
+                        * If we're changing the ruid from A to B, we might race with another thread that's setting ruid from B to A.
+                        * The current locking mechanisms don't allow us to make the entire credential switch operation atomic,
+                        * thus we may be able to change the process credentials from ruid A to B, but get preempted before incrementing the proc
+                        * count of B. If a second thread sees the new process credentials and switches back to ruid A, that other thread
+                        * may be able to decrement the proc count of B before we can increment it. This results in a panic.
+                        * Incrementing the proc count of the target ruid, B, before setting the process credentials prevents this race.
+                        */
+                       if (ruid != KAUTH_UID_NONE) {
+                               (void)chgproccnt(ruid, 1);
+                       }
+
                        proc_lock(p);
                        /*
                         * We need to protect for a race where another thread
                         * also changed the credential after we took our
                         * reference.  If p_ucred has changed then we should
                         * restart this again with the new cred.
+                        *
+                        * Note: kauth_cred_setresuid has consumed a reference to my_cred; if p_ucred != my_cred, then my_cred must not be dereferenced!
                         */
                        if (p->p_ucred != my_cred) {
                                proc_unlock(p);
+                               /*
+                                * We didn't successfully switch to the new ruid, so decrement
+                                * the procs/uid count that we incremented above.
+                                */
+                               if (ruid != KAUTH_UID_NONE) {
+                                       (void)chgproccnt(ruid, -1);
+                               }
                                kauth_cred_unref(&my_new_cred);
                                my_cred = kauth_cred_proc_ref(p);
+                               my_pcred = posix_cred_get(my_cred);
                                /* try again */
                                continue;
                        }
@@ -800,6 +814,13 @@ setuid(proc_t p, struct setuid_args *uap, __unused int32_t *retval)
 
                        OSBitOrAtomic(P_SUGID, &p->p_flag);
                        proc_unlock(p);
+                       /*
+                        * If we've updated the ruid, decrement the count of procs running
+                        * under the previous ruid
+                        */
+                       if (ruid != KAUTH_UID_NONE) {
+                               (void)chgproccnt(my_pcred->cr_ruid, -1);
+                       }
                }
                break;
        }
@@ -845,18 +866,14 @@ seteuid(proc_t p, struct seteuid_args *uap, __unused int32_t *retval)
        my_cred = kauth_cred_proc_ref(p);
        my_pcred = posix_cred_get(my_cred);
 
-       if (euid != my_pcred->cr_ruid && euid != my_pcred->cr_svuid &&
-           (error = suser(my_cred, &p->p_acflag))) {
-               kauth_cred_unref(&my_cred);
-               return (error);
-       }
-
-       /*
-        * Everything's okay, do it.  Copy credentials so other references do
-        * not see our changes.  get current credential and take a reference 
-        * while we muck with it
-        */
        for (;;) {
+
+               if (euid != my_pcred->cr_ruid && euid != my_pcred->cr_svuid &&
+                       (error = suser(my_cred, &p->p_acflag))) {
+                       kauth_cred_unref(&my_cred);
+                       return (error);
+               }
+
                /* 
                 * Set the credential with new info.  If there is no change,
                 * we get back the same credential we passed in; if there is
@@ -881,6 +898,7 @@ seteuid(proc_t p, struct seteuid_args *uap, __unused int32_t *retval)
                                proc_unlock(p);
                                kauth_cred_unref(&my_new_cred);
                                my_cred = kauth_cred_proc_ref(p);
+                               my_pcred = posix_cred_get(my_cred);
                                /* try again */
                                continue;
                        }
@@ -953,32 +971,25 @@ setreuid(proc_t p, struct setreuid_args *uap, __unused int32_t *retval)
        my_cred = kauth_cred_proc_ref(p);
        my_pcred = posix_cred_get(my_cred);
 
-       if (((ruid != KAUTH_UID_NONE &&         /* allow no change of ruid */
-             ruid != my_pcred->cr_ruid &&      /* allow ruid = ruid */
-             ruid != my_pcred->cr_uid &&       /* allow ruid = euid */
-             ruid != my_pcred->cr_svuid) ||    /* allow ruid = svuid */
-            (euid != KAUTH_UID_NONE &&         /* allow no change of euid */
-             euid != my_pcred->cr_uid &&       /* allow euid = euid */
-             euid != my_pcred->cr_ruid &&      /* allow euid = ruid */
-             euid != my_pcred->cr_svuid)) &&   /* allow euid = svui */
-           (error = suser(my_cred, &p->p_acflag))) { /* allow root user any */
-               kauth_cred_unref(&my_cred);
-               return (error);
-       }
-
-       /*
-        * Everything's okay, do it.  Copy credentials so other references do
-        * not see our changes.  get current credential and take a reference 
-        * while we muck with it
-        */
        for (;;) {
+
+               if (((ruid != KAUTH_UID_NONE &&         /* allow no change of ruid */
+                     ruid != my_pcred->cr_ruid &&      /* allow ruid = ruid */
+                     ruid != my_pcred->cr_uid &&       /* allow ruid = euid */
+                     ruid != my_pcred->cr_svuid) ||    /* allow ruid = svuid */
+                    (euid != KAUTH_UID_NONE &&         /* allow no change of euid */
+                     euid != my_pcred->cr_uid &&       /* allow euid = euid */
+                     euid != my_pcred->cr_ruid &&      /* allow euid = ruid */
+                     euid != my_pcred->cr_svuid)) &&   /* allow euid = svuid */
+                   (error = suser(my_cred, &p->p_acflag))) { /* allow root user any */
+                       kauth_cred_unref(&my_cred);
+                       return (error);
+               }
+
                uid_t new_euid;
-               uid_t new_ruid;
                uid_t svuid = KAUTH_UID_NONE;
 
                new_euid = my_pcred->cr_uid;
-               new_ruid = my_pcred->cr_ruid;
-       
                /* 
                 * Set the credential with new info.  If there is no change,
                 * we get back the same credential we passed in; if there is
@@ -986,19 +997,11 @@ setreuid(proc_t p, struct setreuid_args *uap, __unused int32_t *retval)
                 * passed in.  The subsequent compare is safe, because it is
                 * a pointer compare rather than a contents compare.
                 */
-               if (euid == KAUTH_UID_NONE && my_pcred->cr_uid != euid) {
+               if (euid != KAUTH_UID_NONE && my_pcred->cr_uid != euid) {
                        /* changing the effective UID */
                        new_euid = euid;
                        OSBitOrAtomic(P_SUGID, &p->p_flag);
                }
-               if (ruid != KAUTH_UID_NONE && my_pcred->cr_ruid != ruid) {
-                       /* changing the real UID; must do user accounting */
-                       /* chgproccnt uses list lock for protection */
-                       (void)chgproccnt(ruid, 1);
-                       (void)chgproccnt(my_pcred->cr_ruid, -1);
-                       new_ruid = ruid;
-                       OSBitOrAtomic(P_SUGID, &p->p_flag);
-               }
                /*
                 * If the newly requested real uid or effective uid does
                 * not match the saved uid, then set the saved uid to the
@@ -1017,25 +1020,56 @@ setreuid(proc_t p, struct setreuid_args *uap, __unused int32_t *retval)
 
                        DEBUG_CRED_CHANGE("setreuid CH(%d): %p/0x%08x -> %p/0x%08x\n", p->p_pid, my_cred, my_pcred->cr_flags, my_new_cred, posix_cred_get(my_new_cred)->cr_flags);
 
+                       /*
+                        * If we're changing the ruid from A to B, we might race with another thread that's setting ruid from B to A.
+                        * The current locking mechanisms don't allow us to make the entire credential switch operation atomic,
+                        * thus we may be able to change the process credentials from ruid A to B, but get preempted before incrementing the proc
+                        * count of B. If a second thread sees the new process credentials and switches back to ruid A, that other thread
+                        * may be able to decrement the proc count of B before we can increment it. This results in a panic.
+                        * Incrementing the proc count of the target ruid, B, before setting the process credentials prevents this race.
+                        */
+                       if (ruid != KAUTH_UID_NONE) {
+                               (void)chgproccnt(ruid, 1);
+                       }
+
                        proc_lock(p);
                        /*
                         * We need to protect for a race where another thread
                         * also changed the credential after we took our
                         * reference.  If p_ucred has changed then we should
                         * restart this again with the new cred.
+                        *
+                        * Note: kauth_cred_setresuid has consumed a reference to my_cred; if p_ucred != my_cred, then my_cred must not be dereferenced!
                         */
                        if (p->p_ucred != my_cred) {
                                proc_unlock(p);
+                               if (ruid != KAUTH_UID_NONE) {
+                                       /*
+                                        * We didn't successfully switch to the new ruid, so decrement
+                                        * the procs/uid count that we incremented above.
+                                        */
+                                       (void)chgproccnt(ruid, -1);
+                               }
                                kauth_cred_unref(&my_new_cred);
                                my_cred = kauth_cred_proc_ref(p);
+                               my_pcred = posix_cred_get(my_cred);
                                /* try again */
                                continue;
                        }
+
                        p->p_ucred = my_new_cred;
                        /* update cred on proc */
                        PROC_UPDATE_CREDS_ONPROC(p);
-                       OSBitOrAtomic(P_SUGID, &p->p_flag); /* XXX redundant? */
+                       OSBitOrAtomic(P_SUGID, &p->p_flag);
                        proc_unlock(p);
+
+                       if (ruid != KAUTH_UID_NONE) {
+                               /*
+                                * We switched to a new ruid, so decrement the count of procs running
+                                * under the previous ruid
+                                */
+                               (void)chgproccnt(my_pcred->cr_ruid, -1);
+                       }
                }
                break;
        }
@@ -1087,28 +1121,30 @@ setgid(proc_t p, struct setgid_args *uap, __unused int32_t *retval)
        gid = uap->gid;
        AUDIT_ARG(gid, gid);
 
+       /* get current credential and take a reference while we muck with it */
        my_cred = kauth_cred_proc_ref(p);
        my_pcred = posix_cred_get(my_cred);
 
-       if (gid != my_pcred->cr_rgid &&         /* allow setgid(getgid()) */
-           gid != my_pcred->cr_svgid &&        /* allow setgid(saved gid) */
-           (error = suser(my_cred, &p->p_acflag))) {
-               kauth_cred_unref(&my_cred);
-               return (error);
-       }
+       for (;;) {
+               if (gid != my_pcred->cr_rgid &&         /* allow setgid(getgid()) */
+                   gid != my_pcred->cr_svgid &&        /* allow setgid(saved gid) */
+                   (error = suser(my_cred, &p->p_acflag))) {
+                       kauth_cred_unref(&my_cred);
+                       return (error);
+               }
 
-       /*
-        * If we are priviledged, then set the saved and real GID too;
-        * otherwise, just set the effective GID
-        */
-       if (suser(my_cred,  &p->p_acflag) == 0) {
-               svgid = gid;
-               rgid = gid;
-       }
+               /*
+                * If we are privileged, then set the saved and real GID too;
+                * otherwise, just set the effective GID
+                */
+               if (suser(my_cred,  &p->p_acflag) == 0) {
+                       svgid = gid;
+                       rgid = gid;
+               } else {
+                       svgid = KAUTH_GID_NONE;
+                       rgid = KAUTH_GID_NONE;
+               }
 
-       /* get current credential and take a reference while we muck with it */
-       for (;;) {
-               
                /* 
                 * Set the credential with new info.  If there is no change,
                 * we get back the same credential we passed in; if there is
@@ -1133,6 +1169,7 @@ setgid(proc_t p, struct setgid_args *uap, __unused int32_t *retval)
                                kauth_cred_unref(&my_new_cred);
                                /* try again */
                                my_cred = kauth_cred_proc_ref(p);
+                               my_pcred = posix_cred_get(my_cred);
                                continue;
                        }
                        p->p_ucred = my_new_cred;
@@ -1187,18 +1224,18 @@ setegid(proc_t p, struct setegid_args *uap, __unused int32_t *retval)
        egid = uap->egid;
        AUDIT_ARG(egid, egid);
 
+       /* get current credential and take a reference while we muck with it */
        my_cred = kauth_cred_proc_ref(p);
        my_pcred = posix_cred_get(my_cred);
 
-       if (egid != my_pcred->cr_rgid &&
-           egid != my_pcred->cr_svgid &&
-           (error = suser(my_cred, &p->p_acflag))) {
-               kauth_cred_unref(&my_cred);
-               return (error);
-       }
 
-       /* get current credential and take a reference while we muck with it */
        for (;;) {
+               if (egid != my_pcred->cr_rgid &&
+                   egid != my_pcred->cr_svgid &&
+                   (error = suser(my_cred, &p->p_acflag))) {
+                       kauth_cred_unref(&my_cred);
+                       return (error);
+               }
                /* 
                 * Set the credential with new info.  If there is no change,
                 * we get back the same credential we passed in; if there is
@@ -1223,6 +1260,7 @@ setegid(proc_t p, struct setegid_args *uap, __unused int32_t *retval)
                                kauth_cred_unref(&my_new_cred);
                                /* try again */
                                my_cred = kauth_cred_proc_ref(p);
+                               my_pcred = posix_cred_get(my_cred);
                                continue;
                        }
                        p->p_ucred = my_new_cred;
@@ -1298,25 +1336,26 @@ setregid(proc_t p, struct setregid_args *uap, __unused int32_t *retval)
        AUDIT_ARG(egid, egid);
        AUDIT_ARG(rgid, rgid);
 
+       /* get current credential and take a reference while we muck with it */
        my_cred = kauth_cred_proc_ref(p);
        my_pcred = posix_cred_get(my_cred);
 
-       if (((rgid != KAUTH_UID_NONE &&         /* allow no change of rgid */
-             rgid != my_pcred->cr_rgid &&      /* allow rgid = rgid */
-             rgid != my_pcred->cr_gid &&       /* allow rgid = egid */
-             rgid != my_pcred->cr_svgid) ||    /* allow rgid = svgid */
-            (egid != KAUTH_UID_NONE &&         /* allow no change of egid */
-             egid != my_pcred->cr_groups[0] && /* allow no change of egid */
-             egid != my_pcred->cr_gid &&       /* allow egid = egid */
-             egid != my_pcred->cr_rgid &&      /* allow egid = rgid */
-             egid != my_pcred->cr_svgid)) &&   /* allow egid = svgid */
-           (error = suser(my_cred, &p->p_acflag))) { /* allow root user any */
-               kauth_cred_unref(&my_cred);
-               return (error);
-       }
-
-       /* get current credential and take a reference while we muck with it */
        for (;;) {
+
+               if (((rgid != KAUTH_UID_NONE &&         /* allow no change of rgid */
+                     rgid != my_pcred->cr_rgid &&      /* allow rgid = rgid */
+                     rgid != my_pcred->cr_gid &&       /* allow rgid = egid */
+                     rgid != my_pcred->cr_svgid) ||    /* allow rgid = svgid */
+                    (egid != KAUTH_UID_NONE &&         /* allow no change of egid */
+                     egid != my_pcred->cr_groups[0] && /* allow no change of egid */
+                     egid != my_pcred->cr_gid &&       /* allow egid = egid */
+                     egid != my_pcred->cr_rgid &&      /* allow egid = rgid */
+                     egid != my_pcred->cr_svgid)) &&   /* allow egid = svgid */
+                   (error = suser(my_cred, &p->p_acflag))) { /* allow root user any */
+                       kauth_cred_unref(&my_cred);
+                       return (error);
+               }
+
                uid_t new_egid = my_pcred->cr_gid;
                uid_t new_rgid = my_pcred->cr_rgid;
                uid_t svgid = KAUTH_UID_NONE;
@@ -1329,7 +1368,7 @@ setregid(proc_t p, struct setregid_args *uap, __unused int32_t *retval)
                 * passed in.  The subsequent compare is safe, because it is
                 * a pointer compare rather than a contents compare.
                 */
-               if (egid == KAUTH_UID_NONE && my_pcred->cr_gid != egid) {
+               if (egid != KAUTH_UID_NONE && my_pcred->cr_gid != egid) {
                        /* changing the effective GID */
                        new_egid = egid;
                        OSBitOrAtomic(P_SUGID, &p->p_flag);
@@ -1367,6 +1406,7 @@ setregid(proc_t p, struct setregid_args *uap, __unused int32_t *retval)
                                kauth_cred_unref(&my_new_cred);
                                /* try again */
                                my_cred = kauth_cred_proc_ref(p);
+                               my_pcred = posix_cred_get(my_cred);
                                continue;
                        }
                        p->p_ucred = my_new_cred;
@@ -1387,7 +1427,7 @@ setregid(proc_t p, struct setregid_args *uap, __unused int32_t *retval)
 
 /*
  * Set the per-thread override identity.  The first parameter can be the
- * current real UID, KAUTH_UID_NONE, or, if the caller is priviledged, it
+ * current real UID, KAUTH_UID_NONE, or, if the caller is privileged, it
  * can be any UID.  If it is KAUTH_UID_NONE, then as a special case, this
  * means "revert to the per process credential"; otherwise, if permitted,
  * it changes the effective, real, and saved UIDs and GIDs for the current
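
The setuid()/setreuid() rework above changes when chgproccnt() runs relative to publishing the new credential: the target ruid's count is bumped before the credential swap, and either the old ruid's count (on success) or the target's (on retry) is dropped afterwards. A deliberately simplified, single-threaded sketch of that ordering invariant; proccnt[] and switch_ruid() are stand-ins, not XNU code:

    #include <assert.h>
    #include <stdio.h>

    #define NUIDS 4
    static int proccnt[NUIDS];      /* stand-in for the kernel per-uid counts */
    static int cur_ruid;            /* stand-in for p_ucred->cr_ruid */

    static void switch_ruid(int new_ruid)
    {
            proccnt[new_ruid]++;            /* chgproccnt(ruid, 1) before the swap */
            int old_ruid = cur_ruid;
            cur_ruid = new_ruid;            /* p->p_ucred = my_new_cred */
            proccnt[old_ruid]--;            /* chgproccnt(old ruid, -1) afterwards */
            assert(proccnt[cur_ruid] >= 1); /* the visible ruid is always accounted */
    }

    int main(void)
    {
            proccnt[0] = 1;
            cur_ruid = 0;
            switch_ruid(1);
            switch_ruid(0);
            printf("uid0=%d uid1=%d\n", proccnt[0], proccnt[1]);    /* 1 and 0 */
            return 0;
    }
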
index 251629b5b1a85afe64f35ab6191ddead502e666d..c7978d82e01270e2bf692e3fa9baf59fb9ccb55c 100644 (file)
@@ -180,6 +180,11 @@ extern unsigned int vm_page_free_reserved;
 extern unsigned int vm_page_speculative_percentage;
 extern unsigned int vm_page_speculative_q_age_ms;
 
+#if (DEVELOPMENT || DEBUG)
+extern uint32_t        vm_page_creation_throttled_hard;
+extern uint32_t        vm_page_creation_throttled_soft;
+#endif /* DEVELOPMENT || DEBUG */
+
 /*
  * Conditionally allow dtrace to see these functions for debugging purposes.
  */
@@ -2660,6 +2665,7 @@ SYSCTL_INT(_vm, OID_AUTO, vm_page_external_count, CTLFLAG_RD | CTLFLAG_LOCKED, &
 SYSCTL_INT(_vm, OID_AUTO, vm_page_filecache_min, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_page_filecache_min, 0, "");
 
 extern int     vm_compressor_mode;
+extern int     vm_compressor_is_active;
 extern uint32_t        swapout_target_age;
 extern int64_t  compressor_bytes_used;
 extern uint32_t        compressor_eval_period_in_msecs;
@@ -2673,6 +2679,7 @@ extern uint32_t   vm_compressor_unthrottle_threshold_divisor;
 extern uint32_t        vm_compressor_catchup_threshold_divisor;
 
 SYSCTL_INT(_vm, OID_AUTO, compressor_mode, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_compressor_mode, 0, "");
+SYSCTL_INT(_vm, OID_AUTO, compressor_is_active, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_compressor_is_active, 0, "");
 SYSCTL_QUAD(_vm, OID_AUTO, compressor_bytes_used, CTLFLAG_RD | CTLFLAG_LOCKED, &compressor_bytes_used, "");
 SYSCTL_INT(_vm, OID_AUTO, compressor_swapout_target_age, CTLFLAG_RD | CTLFLAG_LOCKED, &swapout_target_age, 0, "");
 
@@ -2699,6 +2706,18 @@ SYSCTL_INT(_vm, OID_AUTO, phantom_cache_thrashing_threshold, CTLFLAG_RW | CTLFLA
 SYSCTL_INT(_vm, OID_AUTO, phantom_cache_thrashing_threshold_ssd, CTLFLAG_RW | CTLFLAG_LOCKED, &phantom_cache_thrashing_threshold_ssd, 0, "");
 #endif
 
+#if (DEVELOPMENT || DEBUG)
+
+SYSCTL_UINT(_vm, OID_AUTO, vm_page_creation_throttled_hard,
+           CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED,
+           &vm_page_creation_throttled_hard, 0, "");
+
+SYSCTL_UINT(_vm, OID_AUTO, vm_page_creation_throttled_soft,
+           CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED,
+           &vm_page_creation_throttled_soft, 0, "");
+
+#endif /* DEVELOPMENT || DEBUG */
+
 /*
  * Enable tracing of voucher contents
  */
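
The sysctl additions above are readable from user space with sysctlbyname(3). A short sketch; the two page-creation throttle counters are only registered on DEVELOPMENT/DEBUG kernels, so the lookup is expected to fail on RELEASE builds:

    #include <sys/types.h>
    #include <sys/sysctl.h>
    #include <stdio.h>

    static void show(const char *name)
    {
            unsigned int val = 0;
            size_t len = sizeof(val);

            if (sysctlbyname(name, &val, &len, NULL, 0) == 0)
                    printf("%s = %u\n", name, val);
            else
                    printf("%s: not available on this kernel\n", name);
    }

    int main(void)
    {
            show("vm.compressor_is_active");
            show("vm.vm_page_creation_throttled_hard");   /* DEVELOPMENT || DEBUG only */
            show("vm.vm_page_creation_throttled_soft");   /* DEVELOPMENT || DEBUG only */
            return 0;
    }
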
index c590c52d69bc06494792ba87eba176923935575a..0f477bdf07927899dd8b4e6e4c62ce11d91f58da 100644 (file)
@@ -113,7 +113,8 @@ static load_result_t load_result_null = {
        .csflags = 0,
        .uuid = { 0 },
        .min_vm_addr = MACH_VM_MAX_ADDRESS,
-       .max_vm_addr = MACH_VM_MIN_ADDRESS
+       .max_vm_addr = MACH_VM_MIN_ADDRESS,
+       .cs_end_offset = 0
 };
 
 /*
@@ -772,6 +773,37 @@ parse_machfile(
                                } else {
                                        got_code_signatures = TRUE;
                                }
+
+                               if (got_code_signatures) {
+                                       boolean_t valid = FALSE, tainted = TRUE;
+                                       struct cs_blob *blobs;
+                                       vm_size_t off = 0;
+
+
+                                       if (cs_debug > 10)
+                                               printf("validating initial pages of %s\n", vp->v_name);
+                                       blobs = ubc_get_cs_blobs(vp);
+                                       
+                                       while (off < size && ret == LOAD_SUCCESS) {
+                                            valid = cs_validate_page(blobs,
+                                                                     NULL,
+                                                                     file_offset + off,
+                                                                     addr + off,
+                                                                     &tainted);
+                                            if (!valid || tainted) {
+                                                    if (cs_debug)
+                                                            printf("CODE SIGNING: %s[%d]: invalid initial page at offset %lld validated:%d tainted:%d csflags:0x%x\n", 
+                                                                   vp->v_name, p->p_pid, (long long)(file_offset + off), valid, tainted, result->csflags);
+                                                    if (cs_enforcement(NULL) ||
+                                                        (result->csflags & (CS_HARD|CS_KILL|CS_ENFORCEMENT))) {
+                                                            ret = LOAD_FAILURE;
+                                                    }
+                                                    result->csflags &= ~CS_VALID;
+                                            }
+                                            off += PAGE_SIZE;
+                                       }
+                               }
+
                                break;
 #if CONFIG_CODE_DECRYPTION
                        case LC_ENCRYPTION_INFO:
@@ -991,6 +1023,20 @@ load_segment(
        if ((scp->fileoff & PAGE_MASK_64) != 0)
                return (LOAD_BADMACHO);
 
+       /*
+        * If we have a code signature attached for this slice,
+        * require that the segments are within the signed part
+        * of the file.
+        */
+       if (result->cs_end_offset &&
+           result->cs_end_offset < (off_t)scp->fileoff &&
+           result->cs_end_offset - scp->fileoff < scp->filesize)
+        {
+               if (cs_debug)
+                       printf("section outside code signature\n");
+               return LOAD_BADMACHO;
+       }
+
        /*
         *      Round sizes to page size.
         */
@@ -1290,25 +1336,46 @@ load_threadstate(
        uint32_t        size;
        int             flavor;
        uint32_t        thread_size;
+       uint32_t        *local_ts;
+       uint32_t        local_ts_size;
 
-    ret = thread_state_initialize( thread );
-    if (ret != KERN_SUCCESS) {
-        return(LOAD_FAILURE);
-    }
+       local_ts = NULL;
+       local_ts_size = 0;
+
+       ret = thread_state_initialize( thread );
+       if (ret != KERN_SUCCESS) {
+               ret = LOAD_FAILURE;
+               goto done;
+       }
     
+       if (total_size > 0) {
+               local_ts_size = total_size;
+               local_ts = kalloc(local_ts_size);
+               if (local_ts == NULL) {
+                       ret = LOAD_FAILURE;
+                       goto done;
+               }
+               memcpy(local_ts, ts, local_ts_size);
+               ts = local_ts;
+       }
+
        /*
-        *      Set the new thread state; iterate through the state flavors in
-      the mach-o file.
+        * Set the new thread state; iterate through the state flavors in
+        * the mach-o file.
         */
        while (total_size > 0) {
                flavor = *ts++;
                size = *ts++;
                if (UINT32_MAX-2 < size ||
-                   UINT32_MAX/sizeof(uint32_t) < size+2)
-                       return (LOAD_BADMACHO);
+                   UINT32_MAX/sizeof(uint32_t) < size+2) {
+                       ret = LOAD_BADMACHO;
+                       goto done;
+               }
                thread_size = (size+2)*sizeof(uint32_t);
-               if (thread_size > total_size)
-                       return(LOAD_BADMACHO);
+               if (thread_size > total_size) {
+                       ret = LOAD_BADMACHO;
+                       goto done;
+               }
                total_size -= thread_size;
                /*
                 * Third argument is a kernel space pointer; it gets cast
@@ -1317,11 +1384,19 @@ load_threadstate(
                 */
                ret = thread_setstatus(thread, flavor, (thread_state_t)ts, size);
                if (ret != KERN_SUCCESS) {
-                       return(LOAD_FAILURE);
+                       ret = LOAD_FAILURE;
+                       goto done;
                }
                ts += size;     /* ts is a (uint32_t *) */
        }
-       return(LOAD_SUCCESS);
+       ret = LOAD_SUCCESS;
+
+done:
+       if (local_ts != NULL) {
+               kfree(local_ts, local_ts_size);
+               local_ts = NULL;
+       }
+       return ret;
 }
 
 static
@@ -1584,7 +1659,7 @@ load_code_signature(
                goto out;
        }
 
-       blob = ubc_cs_blob_get(vp, cputype, -1);
+       blob = ubc_cs_blob_get(vp, cputype, macho_offset);
        if (blob != NULL) {
                /* we already have a blob for this vnode and cputype */
                if (blob->csb_cpu_type == cputype &&
@@ -1644,13 +1719,14 @@ load_code_signature(
        ubc_cs_validation_bitmap_allocate( vp );
 #endif
                
-       blob = ubc_cs_blob_get(vp, cputype, -1);
+       blob = ubc_cs_blob_get(vp, cputype, macho_offset);
 
        ret = LOAD_SUCCESS;
 out:
        if (ret == LOAD_SUCCESS) {
                result->csflags |= blob->csb_flags;
                result->platform_binary = blob->csb_platform_binary;
+               result->cs_end_offset = blob->csb_end_offset;
        }
        if (addr != 0) {
                ubc_cs_blob_deallocate(addr, blob_size);
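
For context on the load_threadstate() changes above: an LC_UNIXTHREAD payload is a sequence of (flavor, count, count 32-bit words of register state) records, and the function now iterates over a kernel-owned copy with the overflow checks shown. A hedged userspace sketch of the same walk; the names are illustrative and the input is assumed to be well formed:

    #include <stdint.h>
    #include <stdio.h>

    static int walk_thread_state(const uint32_t *ts, uint32_t total_size)
    {
            while (total_size > 0) {
                    uint32_t flavor = *ts++;
                    uint32_t count  = *ts++;

                    /* same guards as the patch: reject counts that would overflow */
                    if (UINT32_MAX - 2 < count ||
                        UINT32_MAX / sizeof(uint32_t) < count + 2)
                            return -1;                      /* LOAD_BADMACHO */
                    uint32_t rec = (count + 2) * (uint32_t)sizeof(uint32_t);
                    if (rec > total_size)
                            return -1;                      /* LOAD_BADMACHO */
                    printf("flavor %u, %u words of register state\n", flavor, count);
                    total_size -= rec;
                    ts += count;
            }
            return 0;                                       /* LOAD_SUCCESS */
    }

    int main(void)
    {
            uint32_t blob[] = { 7, 2, 0xaaaa, 0xbbbb };     /* one fake record */
            return walk_thread_state(blob, (uint32_t)sizeof(blob));
    }
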
index d1c83d1f9f12badc884aa33ad49438bbf1a8e333..dc0dbfa5bc3c1e0701ffd3a9cdd9183bc558b9ae 100644 (file)
@@ -70,6 +70,7 @@ typedef struct _load_result {
        mach_vm_address_t       min_vm_addr;
        mach_vm_address_t       max_vm_addr;
        unsigned int            platform_binary;
+       off_t                   cs_end_offset;
 } load_result_t;
 
 struct image_params;
diff --git a/bsd/kern/makekdebugevents.py b/bsd/kern/makekdebugevents.py
new file mode 100755 (executable)
index 0000000..73b2db4
--- /dev/null
@@ -0,0 +1,38 @@
+#!/usr/bin/python
+#
+# This script scans the trace.codes file, containing a mapping of event id to
+# event name for all events, and writes to stdout a C declaration for a table
+# named kd_events[] containing these mappings.
+# Required to generate a header file used by DEVELOPMENT and DEBUG kernels.
+#
+import sys
+import re
+
+# we expect one arg specifying the path to the trace.codes file
+if (len(sys.argv) < 2):
+    exit(1)
+trace_code_file = sys.argv[1]
+
+# regular expression pattern to match <hex_id> <string>
+id_name_pattern = re.compile('0x([0-9a-fA-F]+)\s+([^\s]*)')
+code_table = []
+
+# scan file to generate internal table
+with open(trace_code_file, 'rt') as codes:
+    for line in codes:
+       m = id_name_pattern.match(line)
+       if m:
+            code_table += [(int(m.group(1),base=16), m.group(2))]
+
+# emit typedef:
+print "typedef struct {"
+print "        uint32_t   id;"
+print "        const char *name;"
+print "} kd_event_t;"
+# emit structure declaration and sorted initialization:
+print "kd_event_t kd_events[] = {"
+for mapping in sorted(code_table, key=lambda x: x[0]):
+        print "        {0x%x, \"%s\"}," % mapping
+print "};"
+
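
For reference, the script's output is a plain C table. Fed only the two codes this commit adds to trace.codes, the generated kdebugevents.h fragment would look like the following (abridged; a real run emits every code in the file):

    typedef struct {
            uint32_t   id;
            const char *name;
    } kd_event_t;
    kd_event_t kd_events[] = {
            {0x1a20028, "SFI_GLOBAL_DEFER"},
            {0x5310284, "CPUPM_FI"},
    };
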
index 2af5cc29e2b81423594279252ad318f2b102eae4..a2b82a6e4280951e9bb32fad0a7aec1ef2f59fa0 100644 (file)
@@ -1663,6 +1663,7 @@ fill_vnodeinfo(vnode_t vp, struct vnode_info *vinfo)
                struct stat64 sb;
                int error = 0;
 
+               bzero(&sb, sizeof(struct stat64));
                context = vfs_context_create((vfs_context_t)0);
                error = vn_stat(vp, &sb, NULL, 1, context);
                (void)vfs_context_rele(context);
index c839e868f364ddcf822a0d86e00c403650e62bc9..1247ff35582114d500385bcbfd83b7bd4d05e9b5 100644 (file)
@@ -1487,7 +1487,7 @@ poll_nocancel(struct proc *p, struct poll_nocancel_args *uap, int32_t *retval)
                /* Handle input events */
                if (events & ( POLLIN | POLLRDNORM | POLLPRI | POLLRDBAND | POLLHUP )) {
                        kev.filter = EVFILT_READ;
-                       if (!(events & ( POLLIN | POLLRDNORM )))
+                       if (events & ( POLLPRI | POLLRDBAND ))
                                kev.flags |= EV_OOBAND;
                        kerror = kevent_register(kq, &kev, p);
                }
@@ -1559,7 +1559,7 @@ poll_callback(__unused struct kqueue *kq, struct kevent64_s *kevp, void *data)
        struct poll_continue_args *cont = (struct poll_continue_args *)data;
        struct pollfd *fds = CAST_DOWN(struct pollfd *, kevp->udata);
        short prev_revents = fds->revents;
-       short mask;
+       short mask = 0;
 
        /* convert the results back into revents */
        if (kevp->flags & EV_EOF)
@@ -1572,7 +1572,8 @@ poll_callback(__unused struct kqueue *kq, struct kevent64_s *kevp, void *data)
                if (fds->revents & POLLHUP)
                        mask = (POLLIN | POLLRDNORM | POLLPRI | POLLRDBAND );
                else {
-                       mask = (POLLIN | POLLRDNORM );
+                       if ((kevp->flags & EV_ERROR) == 0 && kevp->data != 0)
+                               mask = (POLLIN | POLLRDNORM );
                        if (kevp->flags & EV_OOBAND)
                                mask |= ( POLLPRI | POLLRDBAND );
                }
index e08f59ff53766cc25fdd7e24330b80778ccd7923..57de6588d46805c05e5499f3fcf3766ed15f10bb 100644 (file)
 0x1a2001c      SFI_WAIT_CANCELED
 0x1a20020      SFI_PID_SET_MANAGED
 0x1a20024      SFI_PID_CLEAR_MANAGED
+0x1a20028      SFI_GLOBAL_DEFER
 0x1a30004      ENERGY_PERF_GPU_DESCRIPTION
 0x1a30008      ENERGY_PERF_GPU_TIME
 0x2010000      L_IP_In_Beg
 0x5310278      CPUPM_PST_UIB
 0x531027C      CPUPM_PST_PLIMIT_UIB
 0x5310280      CPUPM_IO
+0x5310284      CPUPM_FI
 0x5330000      HIBERNATE
 0x5330004      HIBERNATE_WRITE_IMAGE
 0x5330008      HIBERNATE_MACHINE_INIT
index 6b57b3cf8733da668789a871b9f52f41dafc6f3f..2dc33d759b411b7d92e1acb88711bd0e45d82eae 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998-2014 Apple Inc. All rights reserved.
+ * Copyright (c) 1998-2015 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -5392,6 +5392,17 @@ soo_kqfilter(struct fileproc *fp, struct knote *kn, vfs_context_t ctx)
        switch (kn->kn_filter) {
        case EVFILT_READ:
                kn->kn_fop = &soread_filtops;
+               /*
+                * If the caller explicitly asked for OOB results (e.g. poll()),
+                * save that off in the hookid field and reserve the kn_flags
+                * EV_OOBAND bit for output only.
+                */
+               if (kn->kn_flags & EV_OOBAND) {
+                       kn->kn_flags &= ~EV_OOBAND;
+                       kn->kn_hookid = EV_OOBAND;
+               } else {
+                       kn->kn_hookid = 0;
+               }
                skl = &so->so_rcv.sb_sel.si_note;
                break;
        case EVFILT_WRITE:
@@ -5467,44 +5478,42 @@ filt_soread(struct knote *kn, long hint)
        }
 
        /* socket isn't a listener */
-
        kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
+       /*
+        * Clear out EV_OOBAND that filt_soread may have set in the
+        * past.
+        */
+       kn->kn_flags &= ~EV_OOBAND;
 
-       if (so->so_oobmark) {
-               if (kn->kn_flags & EV_OOBAND) {
+       if ((so->so_oobmark) || (so->so_state & SS_RCVATMARK)){
+               kn->kn_flags |= EV_OOBAND;
+               /*
+                * If caller registered explicit interest in OOB data,
+                * return immediately (data == amount beyond mark, for
+                * legacy reasons - that should be changed later).
+                */
+               if (kn->kn_hookid == EV_OOBAND) {
+                       /*
+                        * When so_state is SS_RCVATMARK, so_oobmark
+                        * is 0.
+                        */
                        kn->kn_data -= so->so_oobmark;
                        if ((hint & SO_FILT_HINT_LOCKED) == 0)
                                socket_unlock(so, 1);
                        return (1);
                }
-               kn->kn_data = so->so_oobmark;
-               kn->kn_flags |= EV_OOBAND;
-       } else {
-               if ((so->so_state & SS_CANTRCVMORE)
+       }
+       
+       if ((so->so_state & SS_CANTRCVMORE)
 #if CONTENT_FILTER
-               && cfil_sock_data_pending(&so->so_rcv) == 0
+           && cfil_sock_data_pending(&so->so_rcv) == 0
 #endif /* CONTENT_FILTER */
-               ) {
-                       kn->kn_flags |= EV_EOF;
-                       kn->kn_fflags = so->so_error;
-                       if ((hint & SO_FILT_HINT_LOCKED) == 0)
-                               socket_unlock(so, 1);
-                       return (1);
-               }
-       }
-
-       if (so->so_state & SS_RCVATMARK) {
-               if (kn->kn_flags & EV_OOBAND) {
-                       if ((hint & SO_FILT_HINT_LOCKED) == 0)
-                               socket_unlock(so, 1);
-                       return (1);
-               }
-               kn->kn_flags |= EV_OOBAND;
-       } else if (kn->kn_flags & EV_OOBAND) {
-               kn->kn_data = 0;
+          ) {
+               kn->kn_flags |= EV_EOF;
+               kn->kn_fflags = so->so_error;
                if ((hint & SO_FILT_HINT_LOCKED) == 0)
                        socket_unlock(so, 1);
-               return (0);
+               return (1);
        }
 
        if (so->so_error) {     /* temporary udp error */
@@ -5524,7 +5533,7 @@ filt_soread(struct knote *kn, long hint)
        if ((hint & SO_FILT_HINT_LOCKED) == 0)
                socket_unlock(so, 1);
 
-       return ((kn->kn_flags & EV_OOBAND) || kn->kn_data >= lowwat);
+       return (kn->kn_data >= lowwat);
 }
 
 static void
index 201ad5de36ab3c0506a04b20322e423a0d3dfe7b..c3e668072bc0b61145571253ec603c095881d447 100644 (file)
@@ -1,5 +1,5 @@
 .\"
-.\" Copyright (c) 2008 Apple Inc.  All rights reserved.
+.\" Copyright (c) 2008-2015 Apple Inc.  All rights reserved.
 .\"
 .\" @APPLE_LICENSE_HEADER_START@
 .\" 
@@ -281,6 +281,9 @@ instead of the current state.  Note that some filters may automatically
 set this flag internally.
 .It EV_EOF
 Filters may set this flag to indicate filter-specific EOF condition.
+.It EV_OOBAND
+The read filter on a socket may set this flag to indicate the presence of
+out-of-band data on the descriptor.
 .It EV_ERROR
 See
 .Sx RETURN VALUES
@@ -329,6 +332,12 @@ On return,
 .Va data
 contains the number of bytes of protocol data available to read.
 .Pp
+The presence of EV_OOBAND in
+.Va flags
+indicates the presence of out-of-band data on the socket, with
+.Va data
+equal to the potential number of OOB bytes available to read.
+.Pp
 If the read direction of the socket has shutdown, then the filter
 also sets EV_EOF in
 .Va flags ,
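
A hedged sketch of the usage the new manual text describes: register EVFILT_READ on a connected stream socket and test EV_OOBAND in the returned flags. Error handling is trimmed and sock is assumed to be a connected TCP socket:

    #include <sys/types.h>
    #include <sys/event.h>
    #include <sys/time.h>
    #include <stdio.h>

    int report_readable(int sock)
    {
            struct kevent kev, out;
            int kq = kqueue();

            if (kq < 0)
                    return -1;
            EV_SET(&kev, sock, EVFILT_READ, EV_ADD, 0, 0, NULL);
            if (kevent(kq, &kev, 1, NULL, 0, NULL) < 0)
                    return -1;
            if (kevent(kq, NULL, 0, &out, 1, NULL) == 1)
                    printf("%ld bytes readable%s\n", (long)out.data,
                        (out.flags & EV_OOBAND) ? ", out-of-band data pending" : "");
            return 0;
    }
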
index 0fd816243abcdd5441f328bf4be4228c0ead5014..fe762c21fa36237187ba40beaacba3d90554fe44 100644 (file)
@@ -2401,10 +2401,9 @@ filt_specdetach(struct knote *kn)
        if (ret != KERN_SUCCESS) {
                panic("filt_specdetach(): failed to unlink wait queue link.");
        }
-
+       knote_clearstayqueued(kn);
        (void)wait_queue_link_free(kn->kn_hook);
        kn->kn_hook = NULL;
-       kn->kn_status &= ~KN_STAYQUEUED;
 }
 
 static int 
index 37bea9581365c1a874d034993df8b6cca58459f0..98fff2803da508cb9b4eb7dd7fa9d3c9cf0e4d5f 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2014 Apple Inc. All rights reserved.
+ * Copyright (c) 2004-2015 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -2087,6 +2087,14 @@ bridge_delete_member(struct bridge_softc *sc, struct bridge_iflist *bif,
        BRIDGE_LOCK_ASSERT_HELD(sc);
        VERIFY(ifs != NULL);
 
+       /*
+        * Remove the member from the interface list first so it cannot be
+        * found anymore once we release the bridge lock below
+        */
+       BRIDGE_XLOCK(sc);
+       TAILQ_REMOVE(&sc->sc_iflist, bif, bif_next);
+       BRIDGE_XDROP(sc);
+
        if (!gone) {
                switch (ifs->if_type) {
                case IFT_ETHER:
@@ -2094,8 +2102,15 @@ bridge_delete_member(struct bridge_softc *sc, struct bridge_iflist *bif,
                        /*
                         * Take the interface out of promiscuous mode.
                         */
-                       if (bif->bif_flags & BIFF_PROMISC)
+                       if (bif->bif_flags & BIFF_PROMISC) {
+                               /*
+                                * Unlock to prevent deadlock with bridge_iff_event() in
+                                * case the driver generates an interface event
+                                */
+                               BRIDGE_UNLOCK(sc);
                                (void) ifnet_set_promiscuous(ifs, 0);
+                               BRIDGE_LOCK(sc);
+                       }
                        break;
 
                case IFT_GIF:
@@ -2123,10 +2138,6 @@ bridge_delete_member(struct bridge_softc *sc, struct bridge_iflist *bif,
                bstp_disable(&bif->bif_stp);
 #endif /* BRIDGESTP */
 
-       BRIDGE_XLOCK(sc);
-       TAILQ_REMOVE(&sc->sc_iflist, bif, bif_next);
-       BRIDGE_XDROP(sc);
-
        /*
         * If removing the interface that gave the bridge its mac address, set
         * the mac address of the bridge to the address of the next member, or
index 3ea2612a6ad38bd345b2f0204849f309f955ac6d..e0972f831c9898718c75d95481cd9dd6d0879923 100644 (file)
@@ -85,6 +85,6 @@ typedef __uint32_t n_long;            /* long as received from the net */
 typedef        __uint32_t n_time;              /* ms since 00:00 GMT, byte rev */
 
 #ifdef BSD_KERNEL_PRIVATE
-n_time  iptime(void);
+u_int32_t iptime(void);
 #endif /* BSD_KERNEL_PRIVATE */
 #endif /* _NETINET_IN_SYSTM_H_ */
index ba2869a505cd0a30e15247e0b025fb1cf09fb5ec..256d54b8fc5e02b7ff8d0cd050ab429d8649bc16 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2013 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
  */
 
 struct icmpstat icmpstat;
-SYSCTL_STRUCT(_net_inet_icmp, ICMPCTL_STATS, stats, CTLFLAG_RD | CTLFLAG_LOCKED,
-       &icmpstat, icmpstat, "");
+SYSCTL_STRUCT(_net_inet_icmp, ICMPCTL_STATS, stats,
+    CTLFLAG_RD | CTLFLAG_LOCKED,
+    &icmpstat, icmpstat, "");
 
 static int     icmpmaskrepl = 0;
-SYSCTL_INT(_net_inet_icmp, ICMPCTL_MASKREPL, maskrepl, CTLFLAG_RW | CTLFLAG_LOCKED,
-       &icmpmaskrepl, 0, "");
+SYSCTL_INT(_net_inet_icmp, ICMPCTL_MASKREPL, maskrepl,
+    CTLFLAG_RW | CTLFLAG_LOCKED,
+    &icmpmaskrepl, 0, "");
 
 static int     icmptimestamp = 0;
-SYSCTL_INT(_net_inet_icmp, ICMPCTL_TIMESTAMP, timestamp, CTLFLAG_RW | CTLFLAG_LOCKED,
-       &icmptimestamp, 0, "");
+SYSCTL_INT(_net_inet_icmp, ICMPCTL_TIMESTAMP, timestamp,
+    CTLFLAG_RW | CTLFLAG_LOCKED,
+    &icmptimestamp, 0, "");
 
-static int     drop_redirect = 0;
-SYSCTL_INT(_net_inet_icmp, OID_AUTO, drop_redirect, CTLFLAG_RW | CTLFLAG_LOCKED,
-       &drop_redirect, 0, "");
+static int     drop_redirect = 1;
+SYSCTL_INT(_net_inet_icmp, OID_AUTO, drop_redirect,
+    CTLFLAG_RW | CTLFLAG_LOCKED,
+    &drop_redirect, 0, "");
 
 static int     log_redirect = 0;
-SYSCTL_INT(_net_inet_icmp, OID_AUTO, log_redirect, CTLFLAG_RW | CTLFLAG_LOCKED,
-       &log_redirect, 0, "");
+SYSCTL_INT(_net_inet_icmp, OID_AUTO, log_redirect,
+    CTLFLAG_RW | CTLFLAG_LOCKED,
+    &log_redirect, 0, "");
+
+static int icmp_datalen = 8;
 
 #if ICMP_BANDLIM 
 
@@ -192,19 +199,19 @@ icmp_error(
        struct mbuf *n,
        int type,
        int code,
-       n_long dest,
+       u_int32_t dest,
        u_int32_t nextmtu)
 {
-       struct ip *oip = mtod(n, struct ip *), *nip;
-       unsigned oiplen;
+       struct ip *oip, *nip;
        struct icmp *icp;
        struct mbuf *m;
-       unsigned icmplen;
+       u_int32_t oiphlen, icmplen, icmpelen, nlen;
 
        /* Expect 32-bit aligned data pointer on strict-align platforms */
        MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(n);
 
-       oiplen = IP_VHL_HL(oip->ip_vhl) << 2;
+       oip = mtod(n, struct ip *);
+       oiphlen = IP_VHL_HL(oip->ip_vhl) << 2;
 
 #if ICMPPRINTFS
        if (icmpprintfs)
@@ -218,44 +225,92 @@ icmp_error(
         * Don't error if the old packet protocol was ICMP
         * error message, only known informational types.
         */
-       if (oip->ip_off &(IP_MF|IP_DF))
+       if (oip->ip_off & ~(IP_MF|IP_DF))
                goto freeit;
+
        if (oip->ip_p == IPPROTO_ICMP && type != ICMP_REDIRECT &&
-         n->m_len >= oiplen + ICMP_MINLEN &&
-         !ICMP_INFOTYPE(((struct icmp *)(void *)((caddr_t)oip + oiplen))->
+         n->m_len >= oiphlen + ICMP_MINLEN &&
+         !ICMP_INFOTYPE(((struct icmp *)(void *)((caddr_t)oip + oiphlen))->
          icmp_type)) {
                icmpstat.icps_oldicmp++;
                goto freeit;
        }
-       /* Don't send error in response to a multicast or broadcast packet */
+       /*
+        * Don't send error in response to a multicast or
+        * broadcast packet
+        */
        if (n->m_flags & (M_BCAST|M_MCAST))
                goto freeit;
+
+       /*
+        * Calculate the length to quote from the original packet and prevent
+        * the ICMP mbuf from overflowing.
+        */
+       nlen = m_length(n);
+       if (oip->ip_p == IPPROTO_TCP) {
+               struct tcphdr *th;
+               u_int16_t tcphlen;
+
+               if (oiphlen + sizeof(struct tcphdr) > n->m_len &&
+                   n->m_next == NULL)
+                       goto stdreply;
+               if (n->m_len < (oiphlen + sizeof(struct tcphdr)) &&
+                   (n = m_pullup(n, (oiphlen + sizeof(struct tcphdr)))) == NULL)
+                       goto freeit;
+
+               th = (struct tcphdr *)(void *)((caddr_t)oip + oiphlen);
+               if (th != ((struct tcphdr *)P2ROUNDDOWN(th,
+                   sizeof(u_int32_t))))
+                       goto freeit;
+               tcphlen = th->th_off << 2;
+               if (tcphlen < sizeof(struct tcphdr))
+                       goto freeit;
+               if (oip->ip_len < (oiphlen + tcphlen))
+                       goto freeit;
+               if ((oiphlen + tcphlen) > n->m_len && n->m_next == NULL)
+                       goto stdreply;
+               if (n->m_len < (oiphlen + tcphlen) &&
+                   (n = m_pullup(n, (oiphlen + tcphlen))) == NULL)
+                       goto freeit;
+
+               icmpelen = max(tcphlen, min(icmp_datalen,
+                   (oip->ip_len - oiphlen)));
+       } else
+stdreply:      icmpelen = max(ICMP_MINLEN, min(icmp_datalen,
+                   (ntohs(oip->ip_len) - oiphlen)));
+
+       icmplen = min(oiphlen + icmpelen, min(nlen, oip->ip_len));
+       if (icmplen < sizeof(struct ip))
+               goto freeit;
        /*
         * First, formulate icmp message
         */
-       m = m_gethdr(M_DONTWAIT, MT_HEADER);    /* MAC-OK */
+       if (MHLEN > (sizeof(struct ip) + ICMP_MINLEN + icmplen))
+               m = m_gethdr(M_DONTWAIT, MT_HEADER);    /* MAC-OK */
+       else 
+               m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
+
        if (m == NULL)
                goto freeit;
 
-        if (n->m_flags & M_SKIP_FIREWALL) {
-               /* set M_SKIP_FIREWALL to skip firewall check, since we're called from firewall */
+       if (n->m_flags & M_SKIP_FIREWALL) {
+               /*
+                * set M_SKIP_FIREWALL to skip firewall check, since
+                * we're called from firewall
+                */
                m->m_flags |= M_SKIP_FIREWALL;
        }
 
 #if CONFIG_MACF_NET
        mac_mbuf_label_associate_netlayer(n, m);
 #endif
-       icmplen = min(oiplen + 8, oip->ip_len);
-       if (icmplen < sizeof(struct ip)) {
-               printf("icmp_error: bad length\n");
-               m_free(m);
-               goto freeit;
-       }
-       m->m_len = icmplen + ICMP_MINLEN;
+       m->m_len = icmplen + ICMP_MINLEN; /* for ICMP header and data */
        MH_ALIGN(m, m->m_len);
        icp = mtod(m, struct icmp *);
-       if ((u_int)type > ICMP_MAXTYPE)
-               panic("icmp_error");
+       if ((u_int)type > ICMP_MAXTYPE) {
+               m_freem(m);
+               goto freeit;
+       }
        icmpstat.icps_outhist[type]++;
        icp->icmp_type = type;
        if (type == ICMP_REDIRECT)
@@ -290,8 +345,10 @@ icmp_error(
         * Now, copy old ip header (without options)
         * in front of icmp message.
         */
-       if (m->m_data - sizeof(struct ip) < m->m_pktdat)
-               panic("icmp len");
+       if (m->m_data - sizeof(struct ip) < m->m_pktdat) {
+               m_freem(m);
+               goto freeit;
+       }
        m->m_data -= sizeof(struct ip);
        m->m_len += sizeof(struct ip);
        m->m_pkthdr.len = m->m_len;
@@ -302,6 +359,7 @@ icmp_error(
        nip->ip_vhl = IP_VHL_BORING;
        nip->ip_p = IPPROTO_ICMP;
        nip->ip_tos = 0;
+       nip->ip_off = 0;
        icmp_reflect(m);
 
 freeit:
@@ -856,7 +914,7 @@ icmp_send(struct mbuf *m, struct mbuf *opts)
        ROUTE_RELEASE(&ro);
 }
 
-n_time
+u_int32_t
 iptime(void)
 {
        struct timeval atv;
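
A worked example (assumed header sizes, not from the patch) of the quote-length computation icmp_error() now performs: with a 20-byte IP header, a 20-byte TCP header and icmp_datalen = 8, the error quotes the full 40 bytes of IP plus TCP header rather than a fixed 8 data bytes:

    #include <stdio.h>

    static unsigned umin(unsigned a, unsigned b) { return a < b ? a : b; }
    static unsigned umax(unsigned a, unsigned b) { return a > b ? a : b; }

    int main(void)
    {
            unsigned icmp_datalen = 8;      /* as in the patch */
            unsigned oiphlen = 20;          /* assumed original IP header length */
            unsigned tcphlen = 20;          /* assumed TCP header length */
            unsigned ip_len = 1500;         /* assumed original datagram length */
            unsigned nlen = 1500;           /* assumed mbuf chain length */

            unsigned icmpelen = umax(tcphlen, umin(icmp_datalen, ip_len - oiphlen));
            unsigned icmplen  = umin(oiphlen + icmpelen, umin(nlen, ip_len));
            printf("icmpelen=%u icmplen=%u\n", icmpelen, icmplen);   /* 20 and 40 */
            return 0;
    }
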
index 9dde8f80e93f811a2328efa980453d7e6a0a0a7e..bc1bb0f2f0f64846306fcf763e3c625a38d74da7 100644 (file)
@@ -2041,7 +2041,7 @@ ip_dooptions(struct mbuf *m, int pass, struct sockaddr_in *next_hop)
        struct in_ifaddr *ia;
        int opt, optlen, cnt, off, code, type = ICMP_PARAMPROB, forward = 0;
        struct in_addr *sin, dst;
-       n_time ntime;
+       u_int32_t ntime;
        struct sockaddr_in ipaddr = {
            sizeof (ipaddr), AF_INET, 0, { 0 }, { 0, } };
 
@@ -2305,8 +2305,6 @@ nosourcerouting:
        }
        return (0);
 bad:
-       /* XXX icmp_error adds in hdr length */
-       ip->ip_len -= IP_VHL_HL(ip->ip_vhl) << 2;
        icmp_error(m, type, code, 0, 0);
        OSAddAtomic(1, &ipstat.ips_badoptions);
        return (1);
index 7e2d00b07e4839aaeb72522d9695fc2e89f2c901..2eb86f1a91e8645e4f21ff5ac4f5fad74b26bfa1 100644 (file)
@@ -161,8 +161,10 @@ tcp_cubic_update(struct tcpcb *tp, u_int32_t rtt)
        float K, var;
        u_int32_t elapsed_time, win;
 
-       VERIFY(tp->t_ccstate->cub_last_max > 0);
        win = min(tp->snd_cwnd, tp->snd_wnd);
+       if (tp->t_ccstate->cub_last_max == 0)
+               tp->t_ccstate->cub_last_max = tp->snd_ssthresh;
+
        if (tp->t_ccstate->cub_epoch_start == 0) {
                /*
                 * This is the beginning of a new epoch, initialize some of
index 0cfb8d9533c6bb42fd8a32f5e6251dfeac224023..d7a7130a518139f432ed09f330f65f167e0b8c5d 100644 (file)
@@ -67,7 +67,7 @@
 #ifdef PRIVATE
 
 struct tcp_debug {
-       n_time  td_time;
+       u_int32_t td_time;
        short   td_act;
        short   td_ostate;
        caddr_t td_tcb;
index ee974e44ec5973c24c3766c9f2bd04688d04140d..53e362c89e16b1d5f86f5cce4fc2863054464bb3 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2008-2013 Apple Inc. All rights reserved.
+ * Copyright (c) 2008-2015 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -476,7 +476,7 @@ u_int32_t   rip6_sendspace = RIPV6SNDQ;
 u_int32_t      rip6_recvspace = RIPV6RCVQ;
 
 /* ICMPV6 parameters */
-int    icmp6_rediraccept = 1;          /* accept and process redirects */
+int    icmp6_rediraccept = 0;          /* accept and process redirects */
 int    icmp6_redirtimeout = 10 * 60;   /* 10 minutes */
 int    icmp6errppslim = 500;           /* 500 packets per second */
 int    icmp6rappslim = 10;             /* 10 packets per second */
index 266547020ffb7c39857b752369126a1a50f3b496..4cc199ca8bc0b53ee9ac2c6225b599d2199a3ec7 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2003-2014 Apple Inc. All rights reserved.
+ * Copyright (c) 2003-2015 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -605,27 +605,15 @@ ip6_input(struct mbuf *m)
        }
 
        ip6stat.ip6s_nxthist[ip6->ip6_nxt]++;
-
-#if IPFW2
-       /*
-        * Check with the firewall...
-        */
-       if (ip6_fw_enable && ip6_fw_chk_ptr) {
-               u_short port = 0;
-               /* If ipfw says divert, we have to just drop packet */
-               /* use port as a dummy argument */
-               if ((*ip6_fw_chk_ptr)(&ip6, NULL, &port, &m)) {
-                       m_freem(m);
-                       m = NULL;
-               }
-               if (!m)
-                       goto done;
-       }
-#endif /* IPFW2 */
-
        /*
         * Check against address spoofing/corruption.
         */
+       if (!(m->m_pkthdr.pkt_flags & PKTF_LOOP) &&
+           IN6_IS_ADDR_LOOPBACK(&ip6->ip6_src)) {
+               ip6stat.ip6s_badscope++;
+               in6_ifstat_inc(inifp, ifs6_in_addrerr);
+               goto bad;
+       }
        if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src) ||
            IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_dst)) {
                /*
@@ -681,6 +669,22 @@ ip6_input(struct mbuf *m)
                goto bad;
        }
 #endif
+#if IPFW2
+        /*
+         * Check with the firewall...
+         */
+        if (ip6_fw_enable && ip6_fw_chk_ptr) {
+                u_short port = 0;
+                /* If ipfw says divert, we have to just drop packet */
+                /* use port as a dummy argument */
+                if ((*ip6_fw_chk_ptr)(&ip6, NULL, &port, &m)) {
+                        m_freem(m);
+                        m = NULL;
+                }
+                if (!m)
+                        goto done;
+        }
+#endif /* IPFW2 */
 
        /*
         * Naively assume we can attribute inbound data to the route we would
index a58e5d866ab4899add3726b19fa39f0b408e2420..c0715a2b0267b0e962183eeffc569ceea733cda3 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -3797,7 +3797,7 @@ again:
                }
        }
 
-       if (req->r_achain.tqe_next == NFSREQNOLIST)
+       if (req->r_achain.tqe_next == NFSREQNOLIST || req->r_achain.tqe_next == NFSIODCOMPLETING)
                TAILQ_INSERT_TAIL(&nmp->nm_iodq, req, r_achain);
 
        /* If this mount doesn't already have an nfsiod working on it... */
index ad7d5a2714ea3e5da2b95cbd8b0de4ae5f5a066d..49d487f53d438338a5131fdb13fc60447392758a 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -4302,8 +4302,7 @@ nfs_mount_zombie(struct nfsmount *nmp, int nm_state_flags)
                nfs4_mount_callback_shutdown(nmp);
 
        /* Destroy any RPCSEC_GSS contexts */
-       if (!TAILQ_EMPTY(&nmp->nm_gsscl))
-               nfs_gss_clnt_ctx_unmount(nmp);
+       nfs_gss_clnt_ctx_unmount(nmp);
 
        /* mark the socket for termination */
        lck_mtx_lock(&nmp->nm_lock);
index 880af7e3d08f4123bc3a7077055c47e7f0b59e16..30a5166b8993667706adaad2cd9b65119f3290d1 100644 (file)
@@ -135,7 +135,7 @@ EXPORT_MI_LIST      = ${KERNELFILES} ${PRIVATE_KERNELFILES} linker_set.h bsdtask_info
                                                                vnode_internal.h proc_internal.h file_internal.h mount_internal.h \
                                                                uio_internal.h tree.h munge.h kern_tests.h
 
-EXPORT_MI_GEN_LIST = syscall.h sysproto.h
+EXPORT_MI_GEN_LIST = syscall.h sysproto.h kdebugevents.h
 
 EXPORT_MI_DIR = sys
 
@@ -150,9 +150,10 @@ INSTALL_KF_MI_LCL_GEN_LIST = sysproto.h
 # /System/Library/Frameworks/Kernel.framework/Headers
 INSTALL_KF_MI_LIST = ${KERNELFILES}
 
-INSTALL_KF_MI_GEN_LIST = 
+INSTALL_KF_MI_GEN_LIST =
 
 MAKESYSCALLS = $(SRCROOT)/bsd/kern/makesyscalls.sh
+MAKEKDEBUGEVENTS = $(SRCROOT)/bsd/kern/makekdebugevents.py
 
 $(OBJROOT)/cscope.genhdrs:
        $(_v)mkdir -p $(OBJROOT)/cscope.genhdrs
@@ -167,6 +168,11 @@ sysproto.h: $(SRCROOT)/bsd/kern/syscalls.master $(MAKESYSCALLS) $(OBJROOT)/cscop
        @echo "$(OBJPATH)/bsd/sys/$@" > $(OBJROOT)/cscope.genhdrs/$@.path
        $(_v)$(MAKESYSCALLS) $< proto > /dev/null
 
+kdebugevents.h:  $(SRCROOT)/bsd/kern/trace.codes $(MAKEKDEBUGEVENTS) $(OBJROOT)/cscope.genhdrs
+       @echo "Generating bsd/kern/$@ from $<";
+       @echo "$(OBJPATH)/bsd/kern/$@" > $(OBJROOT)/cscope.genhdrs/$@.path
+       $(_v)$(MAKEKDEBUGEVENTS) $< > "$(OBJPATH)/bsd/sys/$@"
+
 MAKE_POSIX_AVAILABILITY = $(SRCROOT)/bsd/sys/make_posix_availability.sh
 _posix_availability.h: $(MAKE_POSIX_AVAILABILITY)
        @echo "Generating bsd/sys/$@"
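
The new rule generates kdebugevents.h from bsd/kern/trace.codes at build time via the added makekdebugevents.py script. The generated header itself is not part of this diff; purely as a hypothetical illustration, a header of this kind could map kdebug event codes to printable names along these lines (the structure, names, and values below are assumptions, not the script's real output):

    /* Hypothetical sketch of a code-to-name table such a header might hold. */
    #include <stdint.h>

    struct kd_event_name {
            uint32_t    code;   /* kdebug debugid taken from trace.codes */
            const char *name;   /* symbolic event name */
    };

    static const struct kd_event_name kd_event_names[] = {
            { 0x01020304, "EXAMPLE_event" },    /* values purely illustrative */
    };
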
index fa41389d41657b83ca06039ebc72f38c188b482f..3e39fca6a4be23023d72ba0f48d64d1db1746795 100644 (file)
@@ -26,6 +26,8 @@
 /*
  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ *
+ * Portions Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_DTRACE_H
@@ -103,6 +105,7 @@ extern "C" {
 
 #define S_ROUND(x, a)   ((x) + (((a) ? (a) : 1) - 1) & ~(((a) ? (a) : 1) - 1))
 #define P2ROUNDUP(x, align)             (-(-(x) & -(align)))
+#define        P2PHASEUP(x, align, phase)      ((phase) - (((phase) - (x)) & -(align)))
 
 #define        CTF_MODEL_ILP32 1       /* object data model is ILP32 */
 #define        CTF_MODEL_LP64  2       /* object data model is LP64 */
@@ -1046,10 +1049,10 @@ typedef struct dtrace_ecbdesc {
  * DTrace Metadata Description Structures
  *
  * DTrace separates the trace data stream from the metadata stream.  The only
- * metadata tokens placed in the data stream are enabled probe identifiers
- * (EPIDs) or (in the case of aggregations) aggregation identifiers.  In order
- * to determine the structure of the data, DTrace consumers pass the token to
- * the kernel, and receive in return a corresponding description of the enabled
+ * metadata tokens placed in the data stream are the dtrace_rechdr_t (EPID +
+ * timestamp) or (in the case of aggregations) aggregation identifiers.  To
+ * determine the structure of the data, DTrace consumers pass the token to the
+ * kernel, and receive in return a corresponding description of the enabled
  * probe (via the dtrace_eprobedesc structure) or the aggregation (via the
  * dtrace_aggdesc structure).  Both of these structures are expressed in terms
  * of record descriptions (via the dtrace_recdesc structure) that describe the
@@ -1147,11 +1150,12 @@ typedef struct dtrace_fmtdesc {
 #define        DTRACEOPT_AGGHIST       27      /* histogram aggregation output */
 #define        DTRACEOPT_AGGPACK       28      /* packed aggregation output */
 #define        DTRACEOPT_AGGZOOM       29      /* zoomed aggregation scaling */
+#define        DTRACEOPT_TEMPORAL      30      /* temporally ordered output */
 #if !defined(__APPLE__)
-#define DTRACEOPT_MAX           30      /* number of options */
-#else
-#define DTRACEOPT_STACKSYMBOLS  30      /* clear to prevent stack symbolication */
 #define DTRACEOPT_MAX           31      /* number of options */
+#else
+#define DTRACEOPT_STACKSYMBOLS  31      /* clear to prevent stack symbolication */
+#define DTRACEOPT_MAX           32      /* number of options */
 #endif /* __APPLE__ */
 
 #define        DTRACEOPT_UNSET         (dtrace_optval_t)-2     /* unset option */
@@ -1172,7 +1176,9 @@ typedef struct dtrace_fmtdesc {
  * where user-level wishes the kernel to snapshot the buffer to (the
  * dtbd_data field).  The kernel uses the same structure to pass back some
  * information regarding the buffer:  the size of data actually copied out, the
- * number of drops, the number of errors, and the offset of the oldest record.
+ * number of drops, the number of errors, the offset of the oldest record,
+ * and the time of the snapshot.
+ *
  * If the buffer policy is a "switch" policy, taking a snapshot of the
  * principal buffer has the additional effect of switching the active and
  * inactive buffers.  Taking a snapshot of the aggregation buffer _always_ has
@@ -1185,8 +1191,29 @@ typedef struct dtrace_bufdesc {
         uint64_t dtbd_drops;                    /* number of drops */
         DTRACE_PTR(char, dtbd_data);            /* data */
         uint64_t dtbd_oldest;                   /* offset of oldest record */
+       uint64_t dtbd_timestamp;                /* hrtime of snapshot */
 } dtrace_bufdesc_t;
 
+/*
+ * Each record in the buffer (dtbd_data) begins with a header that includes
+ * the epid and a timestamp.  The timestamp is split into two 4-byte parts
+ * so that we do not require 8-byte alignment.
+ */
+typedef struct dtrace_rechdr {
+       dtrace_epid_t dtrh_epid;                /* enabled probe id */
+       uint32_t dtrh_timestamp_hi;             /* high bits of hrtime_t */
+       uint32_t dtrh_timestamp_lo;             /* low bits of hrtime_t */
+} dtrace_rechdr_t;
+
+#define        DTRACE_RECORD_LOAD_TIMESTAMP(dtrh)                      \
+       ((dtrh)->dtrh_timestamp_lo +                            \
+       ((uint64_t)(dtrh)->dtrh_timestamp_hi << 32))
+
+#define        DTRACE_RECORD_STORE_TIMESTAMP(dtrh, hrtime) {           \
+       (dtrh)->dtrh_timestamp_lo = (uint32_t)hrtime;           \
+       (dtrh)->dtrh_timestamp_hi = hrtime >> 32;               \
+}
+
 /*
  * DTrace Status
  *
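
With these additions, each record in a principal-buffer snapshot begins with a dtrace_rechdr_t rather than a bare EPID, and dtbd_timestamp records when the snapshot was taken, which is what makes temporally ordered (DTRACEOPT_TEMPORAL) output possible. A minimal consumer-side sketch of walking such a snapshot follows; the record-size lookup is a placeholder (real consumers size records from the probe description for each EPID), so treat it as an assumption:

    #include <stddef.h>
    #include <stdint.h>

    typedef uint32_t dtrace_epid_t;

    typedef struct dtrace_rechdr {           /* mirrors the struct added above */
            dtrace_epid_t dtrh_epid;
            uint32_t dtrh_timestamp_hi;
            uint32_t dtrh_timestamp_lo;
    } dtrace_rechdr_t;

    #define DTRACE_RECORD_LOAD_TIMESTAMP(dtrh)              \
            ((dtrh)->dtrh_timestamp_lo +                    \
            ((uint64_t)(dtrh)->dtrh_timestamp_hi << 32))

    /* Placeholder: real consumers derive this from dtrace_eprobedesc. */
    static size_t
    record_size_for_epid(dtrace_epid_t epid)
    {
            (void)epid;
            return sizeof (dtrace_rechdr_t);
    }

    static void
    walk_snapshot(const char *buf, size_t len)
    {
            size_t off = 0;
            while (off + sizeof (dtrace_rechdr_t) <= len) {
                    const dtrace_rechdr_t *rh = (const dtrace_rechdr_t *)(buf + off);
                    uint64_t ts = DTRACE_RECORD_LOAD_TIMESTAMP(rh);
                    (void)ts;        /* records can be merged across CPUs by ts */
                    off += record_size_for_epid(rh->dtrh_epid);
            }
    }
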
index 71dc020f2985b08c34a58cbea9f8e86a9cc287d2..cbb14c0abd651fcade830c0dbb5fff44da1acf20 100644 (file)
@@ -22,6 +22,8 @@
 /*
  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ *
+ * Portions Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_DTRACE_IMPL_H
@@ -202,15 +204,18 @@ typedef struct dtrace_hash {
  * predicate is non-NULL, the DIF object is executed.  If the result is
  * non-zero, the action list is processed, with each action being executed
  * accordingly.  When the action list has been completely executed, processing
- * advances to the next ECB.  processing advances to the next ECB.  If the
- * result is non-zero; For each ECB, it first determines the The ECB
- * abstraction allows disjoint consumers to multiplex on single probes.
+ * advances to the next ECB. The ECB abstraction allows disjoint consumers
+ * to multiplex on single probes.
+ *
+ * Execution of the ECB results in consuming dte_size bytes in the buffer
+ * to record data.  During execution, dte_needed bytes must be available in
+ * the buffer.  This space is used for both recorded data and tuple data.
  */
 struct dtrace_ecb {
        dtrace_epid_t dte_epid;                 /* enabled probe ID */
        uint32_t dte_alignment;                 /* required alignment */
-       size_t dte_needed;                      /* bytes needed */
-       size_t dte_size;                        /* total size of payload */
+       size_t dte_needed;                      /* space needed for execution */
+       size_t dte_size;                        /* size of recorded payload */
        dtrace_predicate_t *dte_predicate;      /* predicate, if any */
        dtrace_action_t *dte_action;            /* actions, if any */
        dtrace_ecb_t *dte_next;                 /* next ECB on probe */
@@ -268,27 +273,30 @@ typedef struct dtrace_aggregation {
  * the EPID, the consumer can determine the data layout.  (The data buffer
  * layout is shown schematically below.)  By assuring that one can determine
  * data layout from the EPID, the metadata stream can be separated from the
- * data stream -- simplifying the data stream enormously.
- *
- *      base of data buffer --->  +------+--------------------+------+
- *                                | EPID | data               | EPID |
- *                                +------+--------+------+----+------+
- *                                | data          | EPID | data      |
- *                                +---------------+------+-----------+
- *                                | data, cont.                      |
- *                                +------+--------------------+------+
- *                                | EPID | data               |      |
- *                                +------+--------------------+      |
- *                                |                ||                |
- *                                |                ||                |
- *                                |                \/                |
- *                                :                                  :
- *                                .                                  .
- *                                .                                  .
- *                                .                                  .
- *                                :                                  :
- *                                |                                  |
- *     limit of data buffer --->  +----------------------------------+
+ * data stream -- simplifying the data stream enormously.  The ECB always
+ * precedes the recorded data as part of the dtrace_rechdr_t structure that
+ * includes the EPID and a high-resolution timestamp used for output ordering
+ * consistency.
+ *
+ *      base of data buffer --->  +--------+--------------------+--------+
+ *                                | rechdr | data               | rechdr |
+ *                                +--------+------+--------+----+--------+
+ *                                | data          | rechdr | data        |
+ *                                +---------------+--------+-------------+
+ *                                | data, cont.                          |
+ *                                +--------+--------------------+--------+
+ *                                | rechdr | data               |        |
+ *                                +--------+--------------------+        |
+ *                                |                ||                    |
+ *                                |                ||                    |
+ *                                |                \/                    |
+ *                                :                                      :
+ *                                .                                      .
+ *                                .                                      .
+ *                                .                                      .
+ *                                :                                      :
+ *                                |                                      |
+ *     limit of data buffer --->  +--------------------------------------+
  *
  * When evaluating an ECB, dtrace_probe() determines if the ECB's needs of the
  * principal buffer (both scratch and payload) exceed the available space.  If
@@ -426,6 +434,8 @@ typedef struct dtrace_buffer {
 #ifndef _LP64
        uint64_t dtb_pad1;
 #endif
+       uint64_t dtb_switched;                  /* time of last switch */
+       uint64_t dtb_interval;                  /* observed switch interval */
 } dtrace_buffer_t;
 
 /*
index 66efc61b012cce808b79076cbfd4d1eaf0fd4db9..44cef5438af4fb00afefeb29a56aabd303fd1b32 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2003-2012 Apple Inc. All rights reserved.
+ * Copyright (c) 2003-2015 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -177,14 +177,20 @@ struct kevent64_s {
  * (which always returns true for regular files - regardless of the amount
  * of unread data in the file).
  *
- * On input, EV_OOBAND specifies that only OOB data should be looked for.
- * The returned data count is the number of bytes beyond the current OOB marker.
+ * On input, EV_OOBAND specifies that the filter should actively return in the
+ * presence of OOB data on the descriptor. It implies that the filter will return
+ * if there is OOB data available to read OR when any other condition
+ * for the read is met (for example, the number of bytes of regular data becomes >=
+ * low-watermark).
+ * If EV_OOBAND is not set on input, the filter should not actively return
+ * for out-of-band data on the descriptor. The filter will then only return
+ * when some other condition for read is met (e.g. the number of regular data bytes
+ * is >= low-watermark OR the socket can't receive more data (SS_CANTRCVMORE)).
  *
- * On output, EV_OOBAND indicates that OOB data is present
+ * On output, EV_OOBAND indicates the presence of OOB data on the descriptor.
  * If it was not specified as an input parameter, then the data count is the
- * number of bytes before the current OOB marker. If at the marker, the
- * data count indicates the number of bytes available after it.  In either
- * case, it's the amount of data one could expect to receive next.
+ * number of bytes before the current OOB marker; otherwise, the data count is
+ * the number of bytes beyond the OOB marker.
  */
 #define EV_POLL        EV_FLAG0
 #define EV_OOBAND      EV_FLAG1
@@ -474,7 +480,7 @@ extern int  knote_link_wait_queue(struct knote *kn, struct wait_queue *wq, wait_q
 extern int     knote_unlink_wait_queue(struct knote *kn, struct wait_queue *wq, wait_queue_link_t *wqlp);
 extern void    knote_fdclose(struct proc *p, int fd);
 extern void    knote_markstayqueued(struct knote *kn);
-
+extern void    knote_clearstayqueued(struct knote *kn);
 #endif /* !KERNEL_PRIVATE */
 
 #else  /* KERNEL */
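
For reference, the clarified EV_OOBAND semantics above surface to user space through the kevent interfaces. A minimal sketch of a caller that wants to be woken for either regular or out-of-band socket data might look like this (socket setup and error handling are elided; the snippet is illustrative, not taken from this commit):

    #include <sys/event.h>
    #include <sys/time.h>
    #include <unistd.h>

    /* Returns 1 if the wakeup reported OOB data, 0 otherwise, -1 on error. */
    static int
    wait_for_data_or_oob(int sock)
    {
            struct kevent64_s kev;
            int kq = kqueue();

            if (kq == -1)
                    return -1;

            /* EV_OOBAND on input: also return when OOB data is available. */
            EV_SET64(&kev, sock, EVFILT_READ, EV_ADD | EV_OOBAND, 0, 0, 0, 0, 0);
            if (kevent64(kq, &kev, 1, NULL, 0, 0, NULL) == -1) {
                    close(kq);
                    return -1;
            }

            int n = kevent64(kq, NULL, 0, &kev, 1, 0, NULL);
            close(kq);

            /* EV_OOBAND on output reports the presence of OOB data. */
            return (n == 1) ? ((kev.flags & EV_OOBAND) != 0) : -1;
    }
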
index 186ace8f3818a4f23332dd92e690949828f71c45..af75e23a1033a424531bd244e3692e95d98757a2 100644 (file)
@@ -291,15 +291,15 @@ extern int kdebug_trace(uint32_t code, uint64_t arg1, uint64_t arg2, uint64_t ar
 /* Codes for Selective Forced Idle (DBG_MACH_SFI) */
 #define SFI_SET_WINDOW                 0x0
 #define SFI_CANCEL_WINDOW              0x1
-#define SFI_SET_CLASS_OFFTIME  0x2
+#define SFI_SET_CLASS_OFFTIME          0x2
 #define SFI_CANCEL_CLASS_OFFTIME       0x3
 #define SFI_THREAD_DEFER               0x4
 #define SFI_OFF_TIMER                  0x5
 #define SFI_ON_TIMER                   0x6
 #define SFI_WAIT_CANCELED              0x7
 #define SFI_PID_SET_MANAGED            0x8
-#define SFI_PID_CLEAR_MANAGED  0x9
-
+#define SFI_PID_CLEAR_MANAGED          0x9
+#define SFI_GLOBAL_DEFER               0xa
 /* **** The Kernel Debug Sub Classes for Network (DBG_NETWORK) **** */
 #define DBG_NETIP      1       /* Internet Protocol */
 #define DBG_NETARP     2       /* Address Resolution Protocol */
@@ -462,11 +462,17 @@ extern int kdebug_trace(uint32_t code, uint64_t arg1, uint64_t arg2, uint64_t ar
 #define DBG_TRACE_STRING    1
 #define        DBG_TRACE_INFO      2
 
-/*
- * TRACE_DATA_NEWTHREAD                        0x1
- * TRACE_DATA_EXEC                     0x2
- */
-#define TRACE_DATA_THREAD_TERMINATE    0x3     /* thread has been queued for deallocation and can no longer run */
+/* The Kernel Debug events: */
+#define        TRACE_DATA_NEWTHREAD            (TRACEDBG_CODE(DBG_TRACE_DATA, 1))
+#define        TRACE_DATA_EXEC                 (TRACEDBG_CODE(DBG_TRACE_DATA, 2))
+#define        TRACE_DATA_THREAD_TERMINATE     (TRACEDBG_CODE(DBG_TRACE_DATA, 3))
+#define        TRACE_STRING_NEWTHREAD          (TRACEDBG_CODE(DBG_TRACE_STRING, 1))
+#define        TRACE_STRING_EXEC               (TRACEDBG_CODE(DBG_TRACE_STRING, 2))
+#define        TRACE_PANIC                     (TRACEDBG_CODE(DBG_TRACE_INFO, 0))
+#define        TRACE_TIMESTAMPS                (TRACEDBG_CODE(DBG_TRACE_INFO, 1))
+#define        TRACE_LOST_EVENTS               (TRACEDBG_CODE(DBG_TRACE_INFO, 2))
+#define        TRACE_WRITING_EVENTS            (TRACEDBG_CODE(DBG_TRACE_INFO, 3))
+#define        TRACE_INFO_STRING               (TRACEDBG_CODE(DBG_TRACE_INFO, 4))
 
 /* The Kernel Debug Sub Classes for DBG_CORESTORAGE */
 #define DBG_CS_IO      0
@@ -638,6 +644,7 @@ extern unsigned int kdebug_enable;
 #define KDEBUG_ENABLE_ENTROPY 0x2              /* Obsolescent */
 #define KDEBUG_ENABLE_CHUD    0x4
 #define KDEBUG_ENABLE_PPT     0x8
+#define KDEBUG_ENABLE_SERIAL 0x10
 
 /*
  * Infer the supported kernel debug event level from config option.
@@ -1053,6 +1060,14 @@ typedef struct {
 /* Minimum value allowed when setting decrementer ticks */
 #define KDBG_MINRTCDEC  2500
 
+/* VFS lookup events for serial traces */
+#define VFS_LOOKUP     (FSDBG_CODE(DBG_FSRW,36))
+#define VFS_LOOKUP_DONE        (FSDBG_CODE(DBG_FSRW,39))
+
+#if (DEVELOPMENT || DEBUG)
+#define KDEBUG_MOJO_TRACE 1
+#endif
+
 #endif /* __APPLE_API_PRIVATE */
 #endif /* PRIVATE */
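
The TRACE_* events above are now composed with TRACEDBG_CODE() instead of being written as raw constants. Assuming the usual kdebug debugid packing (class in the top byte, subclass in the next byte, the code shifted left by two, and the low two bits left for DBG_FUNC_START/DBG_FUNC_END), the composition works out as in this small sketch; the numeric values of DBG_TRACE and DBG_TRACE_DATA and the packing macro are restated here as assumptions rather than quoted from the diff:

    #include <stdint.h>
    #include <stdio.h>

    /* Assumed packing, mirroring KDBG_CODE(Class, SubClass, code). */
    #define KDBG_CODE(c, sc, code) \
            ((((c) & 0xff) << 24) | (((sc) & 0xff) << 16) | (((code) & 0x3fff) << 2))
    #define DBG_TRACE       7        /* assumed class value for trace events */
    #define DBG_TRACE_DATA  0        /* assumed subclass value */
    #define DBG_FUNC_START  1

    int
    main(void)
    {
            /* TRACE_DATA_NEWTHREAD == TRACEDBG_CODE(DBG_TRACE_DATA, 1) */
            uint32_t code = KDBG_CODE(DBG_TRACE, DBG_TRACE_DATA, 1);
            printf("0x%08x\n", code | DBG_FUNC_START);   /* prints 0x07000005 */
            return 0;
    }
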
 
index fd363613211bb670bb456ddee59785554f7f3e7e..6a90031c6798426385818b020a232d2eaf75d9e5 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2008 Apple Inc. All rights reserved.
+ * Copyright (c) 2004-2014 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -643,6 +643,7 @@ add_fsevent(int type, vfs_context_t ctx, ...)
                VATTR_WANTED(&va, va_mode);
                VATTR_WANTED(&va, va_uid);
                VATTR_WANTED(&va, va_gid);
+               VATTR_WANTED(&va, va_nlink);
                if ((ret = vnode_getattr(vp, &va, vfs_context_kernel())) != 0) {
                    // printf("add_fsevent: failed to getattr on vp %p (%d)\n", cur->fref.vp, ret);
                    cur->str = NULL;
@@ -655,6 +656,12 @@ add_fsevent(int type, vfs_context_t ctx, ...)
                cur->mode = (int32_t)vnode_vttoif(vnode_vtype(vp)) | va.va_mode;
                cur->uid  = va.va_uid;
                cur->gid  = va.va_gid;
+               if (vp->v_flag & VISHARDLINK) {
+                       cur->mode |= FSE_MODE_HLINK;
+                       if ((vp->v_type == VDIR && va.va_dirlinkcount == 0) || (vp->v_type == VREG && va.va_nlink == 0)) {
+                               cur->mode |= FSE_MODE_LAST_HLINK;
+                       }
+               }
 
                // if we haven't gotten the path yet, get it.
                if (pathbuff == NULL) {
index 579542e1652dd6300d929a1f99855e5de150c316..4beff12a6e9c4364e61e38158991e94e3b74d066 100644 (file)
@@ -1679,9 +1679,9 @@ kdebug_lookup_gen_events(long *dbg_parms, int dbg_namelen, void *dp, boolean_t l
         * entries, we must mark the start of the path's string and the end.
         */
        if (lookup == TRUE)
-               code = (FSDBG_CODE(DBG_FSRW,36)) | DBG_FUNC_START;
+               code = VFS_LOOKUP | DBG_FUNC_START;
        else
-               code = (FSDBG_CODE(DBG_FSRW,39)) | DBG_FUNC_START;
+               code = VFS_LOOKUP_DONE | DBG_FUNC_START;
 
        if (dbg_namelen <= (int)(3 * sizeof(long)))
                code |= DBG_FUNC_END;
index 521cb571366c8f1cc3490b021a40fe5666064802..ba37a4e38abddf71ed2b369c4e70e114928dc11e 100644 (file)
@@ -9235,6 +9235,24 @@ fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long
        is64bit = proc_is64bit(p);
 
        memp = NULL;
+
+       /*
+        * ensure the buffer is large enough for underlying calls
+        */
+#ifndef HFSIOC_GETPATH
+typedef char pn_t[MAXPATHLEN];
+#define HFSIOC_GETPATH  _IOWR('h', 13, pn_t)
+#endif
+
+#ifndef HFS_GETPATH
+#define HFS_GETPATH  IOCBASECMD(HFSIOC_GETPATH)
+#endif
+       if (IOCBASECMD(cmd) == HFS_GETPATH) {
+               /* Round up to MAXPATHLEN regardless of user input */
+               size = MAXPATHLEN;
+       }
+
+
        if (size > sizeof (stkbuf)) {
                if ((memp = (caddr_t)kalloc(size)) == 0) return ENOMEM;
                data = memp;
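
The added check forces the kernel-side scratch buffer to MAXPATHLEN whenever the base command is HFS_GETPATH, instead of trusting the length encoded in the user-supplied ioctl word. The relationship between the encoded length and the base command can be seen with the same <sys/ioccom.h> macros the hunk uses; the snippet below is a user-space illustration, not code from this commit:

    #include <stdio.h>
    #include <sys/ioccom.h>
    #include <sys/param.h>          /* MAXPATHLEN */

    typedef char pn_t[MAXPATHLEN];
    #define HFSIOC_GETPATH  _IOWR('h', 13, pn_t)
    #define HFS_GETPATH     IOCBASECMD(HFSIOC_GETPATH)

    int
    main(void)
    {
            /* IOCPARM_LEN() recovers the buffer length encoded in the command;
             * the kernel now rounds its copy buffer up to MAXPATHLEN anyway. */
            printf("encoded length = %lu\n", (unsigned long)IOCPARM_LEN(HFSIOC_GETPATH));
            printf("base command   = 0x%08lx\n", (unsigned long)HFS_GETPATH);
            return 0;
    }
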
index 1e84dc0e92931851c7a13c1e503fd2b532ff8664..747e47ed953022119eacd22dd20cfed9fb68273d 100644 (file)
@@ -1,4 +1,4 @@
-14.1.0
+14.3.0
 
 # The first line of this file contains the master version number for the kernel.
 # All other instances of the kernel version in xnu are derived from this file.
index 173309543942dcab8e399626e25ecedde9908ced..d7995f81e141003d5fdd9158a433591ae30f0c26 100644 (file)
@@ -24,10 +24,20 @@ _buf_create_shadow
 _buf_kernel_addrperm_addr
 _buf_setfilter
 _buf_shadow
+_bufattr_alloc
+_bufattr_dup
+_bufattr_free
+_bufattr_greedymode
+_bufattr_isochronous
+_bufattr_markgreedymode
+_bufattr_markisochronous
+_bufattr_markmeta
+_bufattr_markquickcomplete
 _bufattr_meta
 _bufattr_nocache
-_bufattr_throttled
 _bufattr_passive
+_bufattr_quickcomplete
+_bufattr_throttled
 _cdevsw
 _cdevsw_setkqueueok
 _chudxnu_platform_ptr
diff --git a/libsyscall/mach/.gitignore b/libsyscall/mach/.gitignore
new file mode 100644 (file)
index 0000000..f718d68
--- /dev/null
@@ -0,0 +1,3 @@
+*.pbxuser
+*.perspectivev3
+build/
index f1ba710627fde0b99002eb687fc91c4d2d08ecf5..92e9547bfbab77f865fe85a984ed0535a1bbcd55 100644 (file)
@@ -337,6 +337,9 @@ atm_get_value(
                                if (kr != KERN_SUCCESS) {
                                        break;
                                }
+                       } else {
+                               kr = KERN_INVALID_TASK;
+                               break;
                        }
 
                        /* Increment sync value. */
@@ -939,8 +942,8 @@ atm_listener_insert(
                         */
                        next->mailbox = mailbox;
                        lck_mtx_unlock(&atm_value->listener_lock);
-                       KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (ATM_CODE(ATM_GETVALUE_INFO, (ATM_VALUE_REPLACED))) | DBG_FUNC_NONE,
-                               atm_value, atm_value->aid, mailbox_offset, 0, 0);
+                       KERNEL_DEBUG_CONSTANT((ATM_CODE(ATM_GETVALUE_INFO, (ATM_VALUE_REPLACED))) | DBG_FUNC_NONE,
+                               VM_KERNEL_ADDRPERM(atm_value), atm_value->aid, mailbox_offset, 0, 0);
 
                        /* Drop the extra reference on task descriptor taken by this function. */
                        atm_task_descriptor_dealloc(task_descriptor);
@@ -948,8 +951,8 @@ atm_listener_insert(
                        return KERN_SUCCESS;
                }
        }
-       KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (ATM_CODE(ATM_GETVALUE_INFO, (ATM_VALUE_ADDED))) | DBG_FUNC_NONE,
-                               atm_value, atm_value->aid, mailbox_offset, 0, 0);
+       KERNEL_DEBUG_CONSTANT((ATM_CODE(ATM_GETVALUE_INFO, (ATM_VALUE_ADDED))) | DBG_FUNC_NONE,
+                               VM_KERNEL_ADDRPERM(atm_value), atm_value->aid, mailbox_offset, 0, 0);
 
        queue_enter(&atm_value->listeners, new_link_object, atm_link_object_t, listeners_element);
        atm_value->listener_count++;
@@ -1006,18 +1009,18 @@ atm_listener_delete(
 
                if (elem->descriptor == task_descriptor) {
                        if (elem->mailbox == mailbox) {
-                               KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (ATM_CODE(ATM_UNREGISTER_INFO,
+                               KERNEL_DEBUG_CONSTANT((ATM_CODE(ATM_UNREGISTER_INFO,
                                        (ATM_VALUE_UNREGISTERED))) | DBG_FUNC_NONE,
-                                       atm_value, atm_value->aid, mailbox_offset, 0, 0);
+                                       VM_KERNEL_ADDRPERM(atm_value), atm_value->aid, mailbox_offset, 0, 0);
                                queue_remove(&atm_value->listeners, elem, atm_link_object_t, listeners_element);
                                queue_enter(&free_listeners, elem, atm_link_object_t, listeners_element);
                                atm_value->listener_count--;
                                kr = KERN_SUCCESS;
                                break;
                        } else {
-                               KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (ATM_CODE(ATM_UNREGISTER_INFO,
+                               KERNEL_DEBUG_CONSTANT((ATM_CODE(ATM_UNREGISTER_INFO,
                                        (ATM_VALUE_DIFF_MAILBOX))) | DBG_FUNC_NONE,
-                                       atm_value, atm_value->aid, 0, 0, 0);
+                                       VM_KERNEL_ADDRPERM(atm_value), atm_value->aid, 0, 0, 0);
                                kr = KERN_INVALID_VALUE;
                                break;
                        }
@@ -1255,7 +1258,7 @@ atm_get_min_sub_aid_array(
        atm_value_t atm_value;
        uint32_t i;
 
-       KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (ATM_CODE(ATM_SUBAID_INFO, (ATM_MIN_CALLED))) | DBG_FUNC_START,
+       KERNEL_DEBUG_CONSTANT((ATM_CODE(ATM_SUBAID_INFO, (ATM_MIN_CALLED))) | DBG_FUNC_START,
                        0, 0, 0, 0, 0);
 
        for (i = 0; i < count; i++) {
@@ -1268,7 +1271,7 @@ atm_get_min_sub_aid_array(
                atm_value_dealloc(atm_value);
        }
 
-       KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (ATM_CODE(ATM_SUBAID_INFO, (ATM_MIN_CALLED))) | DBG_FUNC_END,
+       KERNEL_DEBUG_CONSTANT((ATM_CODE(ATM_SUBAID_INFO, (ATM_MIN_CALLED))) | DBG_FUNC_END,
                        count, 0, 0, 0, 0);
 
 }
@@ -1292,7 +1295,7 @@ atm_get_min_sub_aid(atm_value_t atm_value)
        atm_link_object_t next, elem;
        queue_head_t free_listeners;
 
-       KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (ATM_CODE(ATM_SUBAID_INFO, (ATM_MIN_LINK_LIST))) | DBG_FUNC_START,
+       KERNEL_DEBUG_CONSTANT((ATM_CODE(ATM_SUBAID_INFO, (ATM_MIN_LINK_LIST))) | DBG_FUNC_START,
                        0, 0, 0, 0, 0);
 
        lck_mtx_lock(&atm_value->listener_lock);
@@ -1385,7 +1388,7 @@ atm_get_min_sub_aid(atm_value_t atm_value)
                atm_link_dealloc(next);
        }
        
-       KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (ATM_CODE(ATM_SUBAID_INFO, (ATM_MIN_LINK_LIST))) | DBG_FUNC_END,
+       KERNEL_DEBUG_CONSTANT((ATM_CODE(ATM_SUBAID_INFO, (ATM_MIN_LINK_LIST))) | DBG_FUNC_END,
                        j, freed_count, dead_but_not_freed, 0, 0);
 
        /* explicitly upgrade uint32_t to 64 bit mach size */
index b935e14c62334b9c86a8b99743eedbd376e68161..8b00349ebde1cc4fd851a041e755b8c661209ade 100644 (file)
@@ -80,6 +80,7 @@
 #include <device/device_port.h>
 
 ipc_port_t     master_device_port;
+void        *master_device_kobject;
 
 lck_grp_attr_t * dev_lck_grp_attr;
 lck_grp_t * dev_lck_grp;
@@ -93,8 +94,8 @@ device_service_create(void)
        if (master_device_port == IP_NULL)
            panic("can't allocate master device port");
 
-       ipc_kobject_set(master_device_port, 1, IKOT_MASTER_DEVICE);
-       kernel_set_special_port(host_priv_self(), HOST_IO_MASTER_PORT,
+    ipc_kobject_set(master_device_port, (ipc_kobject_t)&master_device_kobject, IKOT_MASTER_DEVICE);
+    kernel_set_special_port(host_priv_self(), HOST_IO_MASTER_PORT,
                                ipc_port_make_send(master_device_port));
 
        /* allocate device lock group attribute and group */
index fb9157b88ec82daa09d848c6afc938b7ce96540a..35d4bf27973e946dd4ab5f2eb00ef4433383dfa0 100644 (file)
@@ -74,6 +74,7 @@
 #include <mach/vm_prot.h>
 #include <mach/machine.h>
 #include <mach/time_value.h>
+#include <sys/kdebug.h>
 #include <kern/spl.h>
 #include <kern/assert.h>
 #include <kern/debug.h>
 #include <architecture/i386/pio.h> /* inb() */
 #include <pexpert/i386/boot.h>
 
+#include <kdp/kdp_dyld.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_kern.h>
 #endif
 
 static void machine_conf(void);
+void panic_print_symbol_name(vm_address_t search);
 
+extern boolean_t init_task_died;
+extern const char      version[];
+extern char    osversion[];
 extern int             max_unsafe_quanta;
 extern int             max_poll_quanta;
 extern unsigned int    panic_is_inited;
 
+extern int     proc_pid(void *p);
+
+/* Definitions for frame pointers */
+#define FP_ALIGNMENT_MASK      ((uint32_t)(0x3))
+#define FP_LR_OFFSET           ((uint32_t)4)
+#define FP_LR_OFFSET64         ((uint32_t)8)
+#define FP_MAX_NUM_TO_EVALUATE (50)
+
 int db_run_mode;
 
 volatile int pbtcpu = -1;
@@ -155,6 +169,93 @@ static unsigned    commit_paniclog_to_nvram;
 
 unsigned int debug_boot_arg;
 
+/*
+ * Walk and print a backtrace, following frame pointers starting at topfp.
+ */
+void
+print_one_backtrace(pmap_t pmap, vm_offset_t topfp, const char *cur_marker,
+       boolean_t is_64_bit, boolean_t nvram_format) 
+{
+       int                 i = 0;
+       addr64_t        lr;
+       addr64_t        fp;
+       addr64_t        fp_for_ppn;
+       ppnum_t         ppn;
+       boolean_t       dump_kernel_stack;
+
+       fp = topfp;
+       fp_for_ppn = 0;
+       ppn = (ppnum_t)NULL;
+
+       if (fp >= VM_MIN_KERNEL_ADDRESS)
+               dump_kernel_stack = TRUE;
+       else
+               dump_kernel_stack = FALSE;
+
+       do {
+               if ((fp == 0) || ((fp & FP_ALIGNMENT_MASK) != 0))
+                       break;
+               if (dump_kernel_stack && ((fp < VM_MIN_KERNEL_ADDRESS) || (fp > VM_MAX_KERNEL_ADDRESS)))
+                       break;
+               if ((!dump_kernel_stack) && (fp >=VM_MIN_KERNEL_ADDRESS))
+                       break;
+                       
+        /* Check to see if current address will result in a different
+           ppn than previously computed (to avoid recomputation) via
+           ((addr ^ fp_for_ppn) >> PAGE_SHIFT) */
+
+               if ((((fp + FP_LR_OFFSET) ^ fp_for_ppn) >> PAGE_SHIFT) != 0x0U) {
+                       ppn = pmap_find_phys(pmap, fp + FP_LR_OFFSET);
+                       fp_for_ppn = fp + (is_64_bit ? FP_LR_OFFSET64 : FP_LR_OFFSET);
+               }
+               if (ppn != (ppnum_t)NULL) {
+                       if (is_64_bit) {
+                               lr = ml_phys_read_double_64(((((vm_offset_t)ppn) << PAGE_SHIFT)) | ((fp + FP_LR_OFFSET64) & PAGE_MASK));
+                       } else {
+                               lr = ml_phys_read_word(((((vm_offset_t)ppn) << PAGE_SHIFT)) | ((fp + FP_LR_OFFSET) & PAGE_MASK));
+                       }
+               } else {
+                       if (is_64_bit) {
+                               kdb_printf("%s\t  Could not read LR from frame at 0x%016llx\n", cur_marker, fp + FP_LR_OFFSET64);
+                       } else {
+                               kdb_printf("%s\t  Could not read LR from frame at 0x%08x\n", cur_marker, (uint32_t)(fp + FP_LR_OFFSET));
+                       }
+                       break;
+               }
+               if (((fp ^ fp_for_ppn) >> PAGE_SHIFT) != 0x0U) {
+                       ppn = pmap_find_phys(pmap, fp);
+                       fp_for_ppn = fp;
+               }
+               if (ppn != (ppnum_t)NULL) {
+                       if (is_64_bit) {
+                               fp = ml_phys_read_double_64(((((vm_offset_t)ppn) << PAGE_SHIFT)) | (fp & PAGE_MASK));
+                       } else {
+                               fp = ml_phys_read_word(((((vm_offset_t)ppn) << PAGE_SHIFT)) | (fp & PAGE_MASK));
+                       }
+               } else {
+                       if (is_64_bit) {
+                               kdb_printf("%s\t  Could not read FP from frame at 0x%016llx\n", cur_marker, fp);
+                       } else {
+                               kdb_printf("%s\t  Could not read FP from frame at 0x%08x\n", cur_marker, (uint32_t)fp);
+                       }
+                       break;
+               }
+
+               if (nvram_format) {
+                       if (is_64_bit) {
+                               kdb_printf("%s\t0x%016llx\n", cur_marker, lr);
+                       } else {
+                               kdb_printf("%s\t0x%08x\n", cur_marker, (uint32_t)lr);
+                       }
+               } else {                
+                       if (is_64_bit) {
+                               kdb_printf("%s\t  lr: 0x%016llx  fp: 0x%016llx\n", cur_marker, lr, fp);
+                       } else {
+                               kdb_printf("%s\t  lr: 0x%08x  fp: 0x%08x\n", cur_marker, (uint32_t)lr, (uint32_t)fp);
+                       }
+               }
+       } while ((++i < FP_MAX_NUM_TO_EVALUATE) && (fp != topfp));
+}
 void
 machine_startup(void)
 {
@@ -171,6 +272,12 @@ machine_startup(void)
                if (debug_boot_arg & DB_PRT) disable_debug_output=FALSE; 
                if (debug_boot_arg & DB_SLOG) systemLogDiags=TRUE; 
                if (debug_boot_arg & DB_LOG_PI_SCRN) logPanicDataToScreen=TRUE;
+#if KDEBUG_MOJO_TRACE
+               if (debug_boot_arg & DB_PRT_KDEBUG) {
+                       kdebug_serial = TRUE;
+                       disable_debug_output = FALSE;
+               }
+#endif
        } else {
                debug_boot_arg = 0;
        }
@@ -757,6 +864,16 @@ machine_halt_cpu(void) {
        pmCPUHalt(PM_HALT_DEBUG);
 }
 
+static int pid_from_task(task_t task)
+{
+        int pid = -1;
+
+        if (task->bsd_info)
+                pid = proc_pid(task->bsd_info);
+
+        return pid;
+}
+
 void
 DebuggerWithContext(
        __unused unsigned int   reason,
@@ -773,6 +890,9 @@ Debugger(
        unsigned long pi_size = 0;
        void *stackptr;
        int cn = cpu_number();
+       task_t task = current_task();
+       int     task_pid = pid_from_task(task);
+
 
        hw_atomic_add(&debug_mode, 1);   
        if (!panic_is_inited) {
@@ -802,7 +922,12 @@ Debugger(
                __asm__ volatile("movq %%rbp, %0" : "=m" (stackptr));
 
                /* Print backtrace - callee is internally synchronized */
-               panic_i386_backtrace(stackptr, ((panic_double_fault_cpu == cn) ? 80: 48), NULL, FALSE, NULL);
+               if ((task_pid == 1) && (init_task_died)) {
+                       /* Special handling of launchd died panics */
+                       print_launchd_info();
+               } else {
+                       panic_i386_backtrace(stackptr, ((panic_double_fault_cpu == cn) ? 80: 48), NULL, FALSE, NULL);
+               }
 
                /* everything should be printed now so copy to NVRAM
                 */
@@ -994,7 +1119,7 @@ panic_print_kmod_symbol_name(vm_address_t search)
     }
 }
 
-static void
+void
 panic_print_symbol_name(vm_address_t search)
 {
     /* try searching in the kernel */
@@ -1138,3 +1263,184 @@ out:
        bt_tsc_timeout = rdtsc64() + PBT_TIMEOUT_CYCLES;
        while(*ppbtcnt && (rdtsc64() < bt_tsc_timeout));
 }
+
+static boolean_t
+debug_copyin(pmap_t p, uint64_t uaddr, void *dest, size_t size)
+{
+        size_t rem = size;
+        char *kvaddr = dest;
+
+        while (rem) {
+                ppnum_t upn = pmap_find_phys(p, uaddr);
+                uint64_t phys_src = ptoa_64(upn) | (uaddr & PAGE_MASK);
+                uint64_t phys_dest = kvtophys((vm_offset_t)kvaddr);
+                uint64_t src_rem = PAGE_SIZE - (phys_src & PAGE_MASK);
+                uint64_t dst_rem = PAGE_SIZE - (phys_dest & PAGE_MASK);
+                size_t cur_size = (uint32_t) MIN(src_rem, dst_rem);
+                cur_size = MIN(cur_size, rem);
+
+                if (upn && pmap_valid_page(upn) && phys_dest) {
+                        bcopy_phys(phys_src, phys_dest, cur_size);
+                }
+                else
+                        break;
+                uaddr += cur_size;
+                kvaddr += cur_size;
+                rem -= cur_size;
+        }
+        return (rem == 0);
+}
+
+void
+print_threads_registers(thread_t thread)
+{
+       x86_saved_state_t *savestate;
+       
+       savestate = get_user_regs(thread);
+       kdb_printf(
+               "\nRAX: 0x%016llx, RBX: 0x%016llx, RCX: 0x%016llx, RDX: 0x%016llx\n"
+           "RSP: 0x%016llx, RBP: 0x%016llx, RSI: 0x%016llx, RDI: 0x%016llx\n"
+           "R8:  0x%016llx, R9:  0x%016llx, R10: 0x%016llx, R11: 0x%016llx\n"
+               "R12: 0x%016llx, R13: 0x%016llx, R14: 0x%016llx, R15: 0x%016llx\n"
+               "RFL: 0x%016llx, RIP: 0x%016llx, CS:  0x%016llx, SS:  0x%016llx\n\n",
+               savestate->ss_64.rax, savestate->ss_64.rbx, savestate->ss_64.rcx, savestate->ss_64.rdx,
+               savestate->ss_64.isf.rsp, savestate->ss_64.rbp, savestate->ss_64.rsi, savestate->ss_64.rdi,
+               savestate->ss_64.r8, savestate->ss_64.r9,  savestate->ss_64.r10, savestate->ss_64.r11,
+               savestate->ss_64.r12, savestate->ss_64.r13, savestate->ss_64.r14, savestate->ss_64.r15,
+               savestate->ss_64.isf.rflags, savestate->ss_64.isf.rip, savestate->ss_64.isf.cs,
+               savestate->ss_64.isf.ss);
+}
+
+void
+print_tasks_user_threads(task_t task)
+{
+       thread_t                thread = current_thread();
+       x86_saved_state_t *savestate;
+       pmap_t                  pmap = 0;
+       uint64_t                rbp;
+       const char              *cur_marker = 0;
+       int             j;
+       
+       for (j = 0, thread = (thread_t) queue_first(&task->threads); j < task->thread_count;
+                       ++j, thread = (thread_t) queue_next(&thread->task_threads)) {
+
+               kdb_printf("Thread  %p\n", thread);
+               pmap = get_task_pmap(task);
+               savestate = get_user_regs(thread);
+               rbp = savestate->ss_64.rbp;
+               print_one_backtrace(pmap, (vm_offset_t)rbp, cur_marker, TRUE, TRUE);
+               kdb_printf("\n");
+               }
+}
+
+#define PANICLOG_UUID_BUF_SIZE 256
+
+void print_uuid_info(task_t task)
+{
+       uint32_t                uuid_info_count = 0;
+       mach_vm_address_t       uuid_info_addr = 0;
+       boolean_t               have_map = (task->map != NULL) &&       (ml_validate_nofault((vm_offset_t)(task->map), sizeof(struct _vm_map)));
+       boolean_t               have_pmap = have_map && (task->map->pmap != NULL) && (ml_validate_nofault((vm_offset_t)(task->map->pmap), sizeof(struct pmap)));
+       int                             task_pid = pid_from_task(task);
+       char                    uuidbuf[PANICLOG_UUID_BUF_SIZE] = {0};
+       char                    *uuidbufptr = uuidbuf;
+       uint32_t                k;
+
+       if (have_pmap && task->active && task_pid > 0) {
+               /* Read dyld_all_image_infos struct from task memory to get UUID array count & location */
+               struct user64_dyld_all_image_infos task_image_infos;
+               if (debug_copyin(task->map->pmap, task->all_image_info_addr,
+                       &task_image_infos, sizeof(struct user64_dyld_all_image_infos))) {
+                       uuid_info_count = (uint32_t)task_image_infos.uuidArrayCount;
+                       uuid_info_addr = task_image_infos.uuidArray;
+               }
+
+               /* If we get a NULL uuid_info_addr (which can happen when we catch dyld
+                * in the middle of updating this data structure), we zero the
+                * uuid_info_count so that we won't even try to save load info for this task
+                */
+               if (!uuid_info_addr) {
+                       uuid_info_count = 0;
+               }
+       }
+
+       if (task_pid > 0 && uuid_info_count > 0) {
+               uint32_t uuid_info_size = sizeof(struct user64_dyld_uuid_info);
+               uint32_t uuid_array_size = uuid_info_count * uuid_info_size;
+               uint32_t uuid_copy_size = 0;
+               uint32_t uuid_image_count = 0;
+               char *current_uuid_buffer = NULL;
+               /* Copy in the UUID info array. It may be nonresident, in which case just fix up nloadinfos to 0 */
+               
+               kdb_printf("\nuuid info:\n");
+               while (uuid_array_size) {
+                       if (uuid_array_size <= PANICLOG_UUID_BUF_SIZE) {
+                               uuid_copy_size = uuid_array_size;
+                               uuid_image_count = uuid_array_size/uuid_info_size;
+                       } else {
+                               uuid_image_count = PANICLOG_UUID_BUF_SIZE/uuid_info_size;
+                               uuid_copy_size = uuid_image_count * uuid_info_size;
+                       }
+                       if (have_pmap && !debug_copyin(task->map->pmap, uuid_info_addr, uuidbufptr,
+                               uuid_copy_size)) {
+                               kdb_printf("Error!! Failed to copy UUID info for task %p pid %d\n", task, task_pid);
+                               uuid_image_count = 0;
+                               break;
+                       }
+
+                       if (uuid_image_count > 0) {
+                               current_uuid_buffer = uuidbufptr;
+                               for (k = 0; k < uuid_image_count; k++) {
+                                       kdb_printf(" %#llx", *(uint64_t *)current_uuid_buffer);
+                                       current_uuid_buffer += sizeof(uint64_t);
+                                       uint8_t *uuid = (uint8_t *)current_uuid_buffer;
+                                       kdb_printf("\tuuid = <%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x>\n",
+                                       uuid[0], uuid[1], uuid[2], uuid[3], uuid[4], uuid[5], uuid[6], uuid[7], uuid[8],
+                                       uuid[9], uuid[10], uuid[11], uuid[12], uuid[13], uuid[14], uuid[15]);
+                                       current_uuid_buffer += 16;
+                               }
+                               bzero(&uuidbuf, sizeof(uuidbuf));
+                       }
+                       uuid_info_addr += uuid_copy_size;
+                       uuid_array_size -= uuid_copy_size;
+               }
+       }
+}
+
+void print_launchd_info(void)
+{
+       task_t          task = current_task();
+       thread_t        thread = current_thread();
+       volatile        uint32_t *ppbtcnt = &pbtcnt;
+       uint64_t        bt_tsc_timeout;
+       int             cn = cpu_number();
+
+       if(pbtcpu != cn) {
+               hw_atomic_add(&pbtcnt, 1);
+               /* Spin on print backtrace lock, which serializes output
+                * Continue anyway if a timeout occurs.
+                */
+               hw_lock_to(&pbtlock, ~0U);
+               pbtcpu = cn;
+       }
+       
+       print_uuid_info(task);
+       print_threads_registers(thread);
+       print_tasks_user_threads(task);
+       kdb_printf("Mac OS version: %s\n", (osversion[0] != 0) ? osversion : "Not yet set");
+       kdb_printf("Kernel version: %s\n", version);
+       panic_display_kernel_uuid();
+       panic_display_model_name();
+       
+       /* Release print backtrace lock, to permit other callers in the
+        * event of panics on multiple processors.
+        */
+       hw_lock_unlock(&pbtlock);
+       hw_atomic_sub(&pbtcnt, 1);
+       /* Wait for other processors to complete output
+        * Timeout and continue after PBT_TIMEOUT_CYCLES.
+        */
+       bt_tsc_timeout = rdtsc64() + PBT_TIMEOUT_CYCLES;
+       while(*ppbtcnt && (rdtsc64() < bt_tsc_timeout));
+
+}
index e42d4aef22bbf9744fe52d54cb89853d50c96497..76617a0da90dd32591e606b7b4a20337cd6f8fb8 100644 (file)
@@ -35,6 +35,9 @@
 #if CONFIG_MTRR
 #include <i386/mtrr.h>
 #endif
+#if HYPERVISOR
+#include <kern/hv_support.h>
+#endif
 #if CONFIG_VMX
 #include <i386/vmx/vmx_cpu.h>
 #endif
@@ -193,6 +196,11 @@ acpi_sleep_kernel(acpi_sleep_callback func, void *refcon)
        /* Save power management timer state */
        pmTimerSave();
 
+#if HYPERVISOR
+       /* Notify hypervisor that we are about to sleep */
+       hv_suspend();
+#endif
+
 #if CONFIG_VMX
        /* 
         * Turn off VT, otherwise switching to legacy mode will fail
index 532c49ee344ae9ec083e8d0eb736c432df924eea..9cfd5892b4c6322e5033f39314c7b2eae16b5447 100644 (file)
@@ -762,12 +762,10 @@ cpuid_set_cpufamily(i386_cpu_info_t *info_p)
                case CPUID_MODEL_CRYSTALWELL:
                        cpufamily = CPUFAMILY_INTEL_HASWELL;
                        break;
-#if !defined(XNU_HIDE_SEED)
                case CPUID_MODEL_BROADWELL:
                case CPUID_MODEL_BRYSTALWELL:
                        cpufamily = CPUFAMILY_INTEL_BROADWELL;
                        break;
-#endif /* not XNU_HIDE_SEED */
                }
                break;
        }
@@ -944,9 +942,7 @@ leaf7_feature_map[] = {
        {CPUID_LEAF7_FEATURE_RTM,      "RTM"},
        {CPUID_LEAF7_FEATURE_RDSEED,   "RDSEED"},
        {CPUID_LEAF7_FEATURE_ADX,      "ADX"},
-#if !defined(XNU_HIDE_SEED)
        {CPUID_LEAF7_FEATURE_SMAP,     "SMAP"},
-#endif /* not XNU_HIDE_SEED */
        {0, 0}
 };
 
index 1f58d5250ab404f60951e99e8ed7b78ab497e33b..980945d50a80ba4943166263530f07eefed497d4 100644 (file)
 #define CPUID_LEAF7_FEATURE_RTM      _Bit(11)  /* RTM */
 #define CPUID_LEAF7_FEATURE_RDSEED   _Bit(18)  /* RDSEED Instruction */
 #define CPUID_LEAF7_FEATURE_ADX      _Bit(19)  /* ADX Instructions */
-#if !defined(XNU_HIDE_SEED)
 #define CPUID_LEAF7_FEATURE_SMAP     _Bit(20)  /* Supervisor Mode Access Protect */
-#endif /* not XNU_HIDE_SEED */
 
 /*
  * The CPUID_EXTFEATURE_XXX values define 64-bit values
 #define CPUID_MODEL_HASWELL            0x3C
 #define CPUID_MODEL_HASWELL_EP         0x3F
 #define CPUID_MODEL_HASWELL_ULT                0x45
-#if !defined(XNU_HIDE_SEED)
 #define CPUID_MODEL_BROADWELL          0x3D
 #define CPUID_MODEL_BROADWELL_ULX      0x3D
 #define CPUID_MODEL_BROADWELL_ULT      0x3D
 #define CPUID_MODEL_BRYSTALWELL                0x47
-#endif /* not XNU_HIDE_SEED */
 
 #define CPUID_VMM_FAMILY_UNKNOWN       0x0
 #define CPUID_VMM_FAMILY_VMWARE                0x1
index 113031cfac737e181f2715655173227771eae530..e561a690026fc6abd94b6f194371e5b5f9887661 100644 (file)
@@ -167,7 +167,7 @@ void panic_dump_mem(const void *addr, int len)
        }
 }
 
-bool panic_phys_range_before(const void *addr, uint64_t *pphys, 
+boolean_t panic_phys_range_before(const void *addr, uint64_t *pphys, 
                                                         panic_phys_range_t *range)
 {
        *pphys = kvtophys((vm_offset_t)addr);
index 92905ebb402e030c58957a00b119d931c3eba86d..10b38e5758e47e465a155d83f94611cfd340ef95 100644 (file)
@@ -32,7 +32,7 @@
 #if XNU_KERNEL_PRIVATE
 
 #include <stdint.h>
-#include <stdbool.h>
+#include <mach/i386/boolean.h>
 
 typedef struct {
        uint64_t        opaque[6];
@@ -53,7 +53,7 @@ typedef struct panic_phys_range {
        uint64_t len;
 } panic_phys_range_t;
 
-bool panic_phys_range_before(const void *addr, uint64_t *pphys, 
+boolean_t panic_phys_range_before(const void *addr, uint64_t *pphys, 
                                                         panic_phys_range_t *range);
 
 #endif // XNU_KERNEL_PRIVATE
index 2229ab8a82ce1c2618d9419ce7a3e1d41bcba76d..ba0f1b1e5494eaf373df2dd7ef915ea55e474402 100644 (file)
@@ -373,6 +373,16 @@ static inline void invlpg(uintptr_t addr)
        __asm__  volatile("invlpg (%0)" :: "r" (addr) : "memory");
 }
 
+static inline void clac(void)
+{
+       __asm__  volatile("clac");
+}
+
+static inline void stac(void)
+{
+       __asm__  volatile("stac");
+}
+
 /*
  * Access to machine-specific registers (available on 586 and better only)
  * Note: the rd* operations modify the parameters directly (without using
index 0cedaa19dd6d9268b06abb3a646c1142662d0caa..3a99e32a7de3cc328f70cf115a0a67db5e88cf05 100644 (file)
@@ -624,6 +624,17 @@ kernel_trap(
                                        goto debugger_entry;
                                }
 
+                               /*
+                                * Additionally check for SMAP faults...
+                                * which are characterized by page-present and
+                                * the AC bit unset (i.e. not from copyin/out path).
+                                */
+                               if (__improbable(code & T_PF_PROT &&
+                                                pmap_smap_enabled &&
+                                                (saved_state->isf.rflags & EFL_AC) == 0)) {
+                                       goto debugger_entry;
+                               }
+
                                /*
                                 * If we're not sharing cr3 with the user
                                 * and we faulted in copyio,
@@ -802,6 +813,7 @@ panic_trap(x86_saved_state64_t *regs)
        const char      *trapname = "Unknown";
        pal_cr_t        cr0, cr2, cr3, cr4;
        boolean_t       potential_smep_fault = FALSE, potential_kernel_NX_fault = FALSE;
+       boolean_t       potential_smap_fault = FALSE;
 
        pal_get_control_registers( &cr0, &cr2, &cr3, &cr4 );
        assert(ml_get_interrupts_enabled() == FALSE);
@@ -826,6 +838,12 @@ panic_trap(x86_saved_state64_t *regs)
                } else if (regs->isf.rip >= VM_MIN_KERNEL_AND_KEXT_ADDRESS) {
                        potential_kernel_NX_fault = TRUE;
                }
+       } else if (pmap_smap_enabled &&
+                  regs->isf.trapno == T_PAGE_FAULT &&
+                  regs->isf.err & T_PF_PROT &&
+                  regs->cr2 < VM_MAX_USER_PAGE_ADDRESS &&
+                  regs->isf.rip >= VM_MIN_KERNEL_AND_KEXT_ADDRESS) {
+               potential_smap_fault = TRUE;
        }
 
 #undef panic
@@ -848,7 +866,7 @@ panic_trap(x86_saved_state64_t *regs)
              virtualized ? " VMM" : "",
              potential_kernel_NX_fault ? " Kernel NX fault" : "",
              potential_smep_fault ? " SMEP/User NX fault" : "",
-             "");
+             potential_smap_fault ? " SMAP fault" : "");
        /*
         * This next statement is not executed,
         * but it's needed to stop the compiler using tail call optimization
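
The new branch treats a protection-mode kernel page fault as a likely SMAP violation when SMAP is enabled and EFLAGS.AC is clear, because sanctioned kernel accesses to user memory bracket themselves with the stac()/clac() inlines added to proc_reg.h above. A schematic of that bracketing pattern follows; it shows the general idea only and is not the literal xnu copyio implementation:

    #include <string.h>

    /* From the proc_reg.h hunk above: toggle EFLAGS.AC around user access. */
    static inline void stac(void) { __asm__ volatile("stac"); }
    static inline void clac(void) { __asm__ volatile("clac"); }

    /* Schematic only: real copyin() also validates addresses and handles faults. */
    static int
    copyin_sketch(const void *user_src, void *kernel_dst, size_t len)
    {
            stac();                              /* permit supervisor access to user pages */
            memcpy(kernel_dst, user_src, len);   /* a fault here occurs with AC set */
            clac();                              /* any user access with AC clear is what
                                                  * the trap handler above now flags */
            return 0;
    }
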
index 5eed5e2d12aeeff5aa1982dfe5abae8817a96ed7..619f87eaf4741efa2d951218596e696c995f09f6 100644 (file)
@@ -151,6 +151,12 @@ extern volatile perfASTCallback perfASTHook;
 extern volatile perfCallback perfIntHook;
 
 extern void            panic_i386_backtrace(void *, int, const char *, boolean_t, x86_saved_state_t *);
+extern void    print_one_backtrace(pmap_t pmap, vm_offset_t topfp, const char *cur_marker,     boolean_t is_64_bit, boolean_t nvram_format);
+extern void    print_tasks_user_threads(task_t task);
+extern void    print_threads_registers(thread_t thread);
+extern void    print_uuid_info(task_t task);
+extern void    print_launchd_info(void);
+
 #if MACH_KDP
 extern boolean_t       kdp_i386_trap(
                                unsigned int,
index 67c303afeae526ab61f567e5bcd19116dea4aaf5..cc0acd9127d0156109b7a4adbb56149dc760a5c7 100644 (file)
@@ -480,6 +480,7 @@ mach_port_kobject(
        kaddr = (mach_vm_address_t)port->ip_kobject;
        ip_unlock(port);
 
+
        if (0 != kaddr && is_ipc_kobject(*typep))
                *addrp = VM_KERNEL_UNSLIDE_OR_PERM(kaddr);
        else
index 3fc7a40a71a78c040efad7800e6e88e11da93c89..c2edb90698487d9979142a53aad9160541baf997 100644 (file)
@@ -49,6 +49,7 @@
 /* BSD KERN COMPONENT INTERFACE */
 
 task_t bsd_init_task = TASK_NULL;
+boolean_t init_task_died;
 char   init_task_failure_data[1024];
 extern unsigned int not_in_kdp; /* Skip acquiring locks if we're in kdp */
  
index 6dc10f748c70426f062e09e73960e8cadcdeb328..6f527a66fdb67e6c44fdbb5ff665e28b3d651177 100644 (file)
@@ -101,6 +101,7 @@ unsigned int        disable_debug_output = TRUE;
 unsigned int   systemLogDiags = FALSE;
 unsigned int   panicDebugging = FALSE;
 unsigned int   logPanicDataToScreen = FALSE;
+unsigned int   kdebug_serial = FALSE;
 
 int mach_assert = 1;
 
@@ -497,7 +498,7 @@ void populate_model_name(char *model_string) {
        strlcpy(model_name, model_string, sizeof(model_name));
 }
 
-static void panic_display_model_name(void) {
+void panic_display_model_name(void) {
        char tmp_model_name[sizeof(model_name)];
 
        if (ml_nofault_copy((vm_offset_t) &model_name, (vm_offset_t) &tmp_model_name, sizeof(model_name)) != sizeof(model_name))
@@ -509,7 +510,7 @@ static void panic_display_model_name(void) {
                kdb_printf("System model name: %s\n", tmp_model_name);
 }
 
-static void panic_display_kernel_uuid(void) {
+void panic_display_kernel_uuid(void) {
        char tmp_kernel_uuid[sizeof(kernel_uuid_string)];
 
        if (ml_nofault_copy((vm_offset_t) &kernel_uuid_string, (vm_offset_t) &tmp_kernel_uuid, sizeof(kernel_uuid_string)) != sizeof(kernel_uuid_string))
@@ -628,6 +629,8 @@ __private_extern__ void panic_display_ecc_errors()
 #if CONFIG_ZLEAKS
 extern boolean_t       panic_include_ztrace;
 extern struct ztrace* top_ztrace;
+void panic_print_symbol_name(vm_address_t search);
+
 /*
  * Prints the backtrace most suspected of being a leaker, if we paniced in the zone allocator.
  * top_ztrace and panic_include_ztrace comes from osfmk/kern/zalloc.c
@@ -636,6 +639,9 @@ __private_extern__ void panic_display_ztrace(void)
 {
        if(panic_include_ztrace == TRUE) {
                unsigned int i = 0;
+               boolean_t keepsyms = FALSE;
+
+               PE_parse_boot_argn("keepsyms", &keepsyms, sizeof (keepsyms));
                struct ztrace top_ztrace_copy;
                
                /* Make sure not to trip another panic if there's something wrong with memory */
@@ -643,7 +649,11 @@ __private_extern__ void panic_display_ztrace(void)
                        kdb_printf("\nBacktrace suspected of leaking: (outstanding bytes: %lu)\n", (uintptr_t)top_ztrace_copy.zt_size);
                        /* Print the backtrace addresses */
                        for (i = 0; (i < top_ztrace_copy.zt_depth && i < MAX_ZTRACE_DEPTH) ; i++) {
-                               kdb_printf("%p\n", top_ztrace_copy.zt_stack[i]);
+                               kdb_printf("%p ", top_ztrace_copy.zt_stack[i]);
+                               if (keepsyms) {
+                                       panic_print_symbol_name((vm_address_t)top_ztrace_copy.zt_stack[i]);
+                               }
+                               kdb_printf("\n");
                        }
                        /* Print any kexts in that backtrace, along with their link addresses so we can properly blame them */
                        kmod_panic_dump((vm_offset_t *)&top_ztrace_copy.zt_stack[0], top_ztrace_copy.zt_depth);
index 85acc47fd98b0a92826ead2d3e1dd415142deb60..407d4b4f28f985133ed8a5b260795b5f80914078 100644 (file)
@@ -299,6 +299,7 @@ extern unsigned int         disable_debug_output;
 
 extern unsigned int    panicDebugging;
 extern unsigned int    logPanicDataToScreen;
+extern unsigned int    kdebug_serial;
 
 extern int db_run_mode;
 
@@ -332,6 +333,8 @@ void        panic_display_system_configuration(void);
 void   panic_display_zprint(void);
 void   panic_display_kernel_aslr(void);
 void   panic_display_hibb(void);
+void   panic_display_model_name(void);
+void   panic_display_kernel_uuid(void);
 #if CONFIG_ZLEAKS
 void   panic_display_ztrace(void);
 #endif /* CONFIG_ZLEAKS */
@@ -359,7 +362,8 @@ void        panic_display_ecc_errors(void);
                                                * post-panic crashdump/paniclog
                                                * dump.
                                                */
-#define DB_NMI_BTN_ENA  0x8000 /* Enable button to directly trigger NMI */
+#define DB_NMI_BTN_ENA  0x8000  /* Enable button to directly trigger NMI */
+#define DB_PRT_KDEBUG   0x10000 /* kprintf KDEBUG traces */
 
 #if DEBUG
 /*
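The DB_PRT_KDEBUG bit added above extends the existing debug= boot-arg bitmask, and the startup.c hunk further down only forces a minimal trace buffer and typefilter once kdebug_serial is already TRUE, so some earlier piece of boot code must translate the bit into the flag. That consumer is not part of this excerpt; the following is only an illustrative sketch of how such a translation would typically look, not code from the commit:

/* Illustrative sketch (assumed, not from this commit): turn the
 * DB_PRT_KDEBUG bit of the debug= boot-arg into kdebug_serial. */
unsigned int debug_boot_arg = 0;

if (PE_parse_boot_argn("debug", &debug_boot_arg, sizeof(debug_boot_arg))) {
	if (debug_boot_arg & DB_PRT_KDEBUG)	/* e.g. boot-args="debug=0x10000" */
		kdebug_serial = TRUE;
}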
index 9d032d2d0362d8f323e8d758b4164e70aa4214c4..c60df98863f3de3d366d7c55dbc0e0c9150c4ad6 100644 (file)
@@ -44,6 +44,7 @@ int hv_support_available = 0;
 hv_callbacks_t hv_callbacks = {
        .dispatch = NULL,               /* thread is being dispatched for execution */
        .preempt = NULL,                /* thread is being preempted */
+       .suspend = NULL,                /* system is being suspended */
        .thread_destroy = NULL, /* thread is being destroyed */
        .task_destroy = NULL,   /* task is being destroyed */
        .volatile_state = NULL, /* thread state is becoming volatile */
@@ -142,7 +143,7 @@ hv_mp_notify(void) {
                        lck_mtx_unlock(hv_support_lck_mtx);
                        break;
                } else {
-                       hv_callbacks.memory_pressure(NULL);
+                       hv_callbacks.memory_pressure();
                }
                lck_mtx_unlock(hv_support_lck_mtx);
        }
@@ -244,6 +245,7 @@ hv_release_callbacks(void) {
        hv_callbacks = (hv_callbacks_t) {
                .dispatch = NULL,
                .preempt = NULL,
+               .suspend = NULL,
                .thread_destroy = NULL,
                .task_destroy = NULL,
                .volatile_state = NULL,
@@ -254,6 +256,14 @@ hv_release_callbacks(void) {
        lck_mtx_unlock(hv_support_lck_mtx);
 }
 
+/* system suspend notification */
+void
+hv_suspend(void) {
+       if (hv_callbacks_enabled) {
+               hv_callbacks.suspend();
+       }
+}
+
 /* dispatch hv_task_trap/hv_thread_trap syscalls to trap handlers,
    fail for invalid index or absence of trap handlers, trap handler is
    responsible for validating targets */
index 485654f707eb3442d81dd951dffab2c831a626eb..aaedb76ae5e0c604aba4014826cae3493a28d22c 100644 (file)
@@ -45,9 +45,7 @@ typedef enum {
        HV_THREAD_TRAP = 1
 } hv_trap_type_t;
 
-typedef kern_return_t (*hv_trap_t) (void *thread_target, uint64_t arg);
-typedef void (*hv_callback_0_t)(void *target);
-typedef void (*hv_callback_1_t)(void *target, int argument);
+typedef kern_return_t (*hv_trap_t) (void *target, uint64_t arg);
 
 typedef struct  {
        const hv_trap_t *traps;
@@ -55,12 +53,13 @@ typedef struct  {
 } hv_trap_table_t;
 
 typedef struct {
-       hv_callback_0_t dispatch;
-       hv_callback_0_t preempt;
-       hv_callback_0_t thread_destroy;
-       hv_callback_0_t task_destroy;
-       hv_callback_1_t volatile_state;
-       hv_callback_0_t memory_pressure;
+       void (*dispatch)(void *vcpu);
+       void (*preempt)(void *vcpu);
+       void (*suspend)(void);
+       void (*thread_destroy)(void *vcpu);
+       void (*task_destroy)(void *vm);
+       void (*volatile_state)(void *vcpu, int state);
+       void (*memory_pressure)(void);
 } hv_callbacks_t;
 
 extern hv_callbacks_t hv_callbacks;
@@ -79,7 +78,8 @@ extern kern_return_t hv_set_traps(hv_trap_type_t trap_type,
        const hv_trap_t *traps, unsigned trap_count);
 extern void hv_release_traps(hv_trap_type_t trap_type);
 extern kern_return_t hv_set_callbacks(hv_callbacks_t callbacks);
-extern void hv_release_callbacks(void) ;
+extern void hv_release_callbacks(void);
+extern void hv_suspend(void);
 extern kern_return_t hv_task_trap(uint64_t index, uint64_t arg);
 extern kern_return_t hv_thread_trap(uint64_t index, uint64_t arg);
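The reworked callback table above is registered by value through hv_set_callbacks(). A minimal registration sketch, assuming <kern/hv_support.h> and using placeholder handler names (none of the hv_client_* symbols exist in xnu), shows where the new suspend hook fits:

/* Illustrative client-side sketch; handler names and bodies are placeholders. */
static void hv_client_dispatch(void *vcpu)        { (void)vcpu; /* restore guest state */ }
static void hv_client_preempt(void *vcpu)         { (void)vcpu; /* save guest state */ }
static void hv_client_suspend(void)               { /* quiesce VMX state before system sleep */ }
static void hv_client_thread_destroy(void *vcpu)  { (void)vcpu; }
static void hv_client_task_destroy(void *vm)      { (void)vm; }
static void hv_client_volatile(void *vcpu, int s) { (void)vcpu; (void)s; }
static void hv_client_memory_pressure(void)       { /* trim guest caches */ }

static kern_return_t
hv_client_register(void)
{
	hv_callbacks_t cb = {
		.dispatch        = hv_client_dispatch,
		.preempt         = hv_client_preempt,
		.suspend         = hv_client_suspend,		/* new in this revision */
		.thread_destroy  = hv_client_thread_destroy,
		.task_destroy    = hv_client_task_destroy,
		.volatile_state  = hv_client_volatile,
		.memory_pressure = hv_client_memory_pressure,
	};
	return hv_set_callbacks(cb);	/* may be refused if callbacks are already claimed */
}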
 
index 725164c77c311d009a3af2106aaab9e63121e3fb..819c6c686fc583cb0d67d3d2d82763a8c5ba340f 100644 (file)
@@ -162,6 +162,7 @@ struct sfi_class_state {
        uint64_t        off_time_interval;
 
        timer_call_data_t       on_timer;
+       uint64_t        on_timer_deadline;
        boolean_t                       on_timer_programmed;
 
        boolean_t       class_sfi_is_enabled;
@@ -335,12 +336,15 @@ static void sfi_timer_global_off(
 
                        /* Push out on-timer */
                        on_timer_deadline = now + sfi_classes[i].off_time_interval;
+                       sfi_classes[i].on_timer_deadline = on_timer_deadline;
+
                        timer_call_enter1(&sfi_classes[i].on_timer, NULL, on_timer_deadline, TIMER_CALL_SYS_CRITICAL);
                } else {
                        /* If this class no longer needs SFI, make sure the timer is cancelled */
                        sfi_classes[i].class_in_on_phase = TRUE;
                        if (sfi_classes[i].on_timer_programmed) {
                                sfi_classes[i].on_timer_programmed = FALSE;
+                               sfi_classes[i].on_timer_deadline = ~0ULL;
                                timer_call_cancel(&sfi_classes[i].on_timer);
                        }
                }
@@ -420,7 +424,10 @@ static void sfi_timer_per_class_on(
         * Since we have the sfi_lock held and have changed "class_in_on_phase", we expect
         * no new threads to be put on this wait queue until the global "off timer" has fired.
         */
+
        sfi_class->class_in_on_phase = TRUE;
+       sfi_class->on_timer_programmed = FALSE;
+
        kret = wait_queue_wakeup64_all(&sfi_class->wait_queue,
                                                                   CAST_EVENT64_T(sfi_class_id),
                                                                   THREAD_AWAKENED);
@@ -532,6 +539,52 @@ kern_return_t sfi_window_cancel(void)
        return (KERN_SUCCESS);
 }
 
+/* Defers SFI off and per-class on timers (if live) by the specified interval
+ * in Mach Absolute Time Units. Currently invoked to align with the global
+ * forced idle mechanism. Making some simplifying assumptions, the iterative GFI
+ * induced SFI on+off deferrals form a geometric series that converges to yield
+ * an effective SFI duty cycle that is scaled by the GFI duty cycle. Initial phase
+ * alignment and congruency of the SFI/GFI periods can distort this to some extent.
+ */
+
+kern_return_t sfi_defer(uint64_t sfi_defer_matus)
+{
+       spl_t           s;
+       kern_return_t kr = KERN_FAILURE;
+       s = splsched();
+
+       KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_GLOBAL_DEFER), sfi_defer_matus, 0, 0, 0, 0);
+
+       simple_lock(&sfi_lock);
+       if (!sfi_is_enabled) {
+               goto sfi_defer_done;
+       }
+
+       assert(sfi_next_off_deadline != 0);
+
+       sfi_next_off_deadline += sfi_defer_matus;
+       timer_call_enter1(&sfi_timer_call_entry, NULL, sfi_next_off_deadline, TIMER_CALL_SYS_CRITICAL);
+
+       int i;
+       for (i = 0; i < MAX_SFI_CLASS_ID; i++) {
+               if (sfi_classes[i].class_sfi_is_enabled) {
+                       if (sfi_classes[i].on_timer_programmed) {
+                               uint64_t new_on_deadline = sfi_classes[i].on_timer_deadline + sfi_defer_matus;
+                               sfi_classes[i].on_timer_deadline = new_on_deadline;
+                               timer_call_enter1(&sfi_classes[i].on_timer, NULL, new_on_deadline, TIMER_CALL_SYS_CRITICAL);
+                       }
+               }
+       }
+
+       kr = KERN_SUCCESS;
+sfi_defer_done:
+       simple_unlock(&sfi_lock);
+
+       splx(s);
+
+       return (kr);
+}
+
 
 kern_return_t sfi_get_window(uint64_t *window_usecs)
 {
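On the geometric-series remark in the sfi_defer() comment: one concrete reading, under the same simplifying assumptions the comment makes, is that each deferral stretches the nominal SFI window T by the forced-idle fraction g, so the effective window converges to T * (1 + g + g^2 + ...) = T / (1 - g) and the off-phase duty cycle shrinks from d to d * (1 - g). A tiny illustration of that reading (not xnu code):

/* Illustrative only: effective SFI off-phase duty cycle under iterative
 * GFI deferral, assuming the geometric-series interpretation above.
 * sfi_off_duty  = nominal off-time / nominal SFI window
 * gfi_idle_duty = fraction of time GFI forces the system idle */
static double
sfi_effective_off_duty(double sfi_off_duty, double gfi_idle_duty)
{
	return sfi_off_duty * (1.0 - gfi_idle_duty);
}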
index 385b57cf01e522fcd923c6f7224cceb8954a8fbe..7ac6259b3e6ae96a44d89f3d3bfe30c23d5cac58 100644 (file)
@@ -64,6 +64,7 @@ ast_t sfi_processor_needs_ast(processor_t processor);
 
 void sfi_ast(thread_t thread);
 void sfi_reevaluate(thread_t thread);
+kern_return_t sfi_defer(uint64_t);
 #endif /* MACH_KERNEL_PRIVATE */
 
 #endif /* _KERN_SFI_H_ */
index e67c01a8b493e48e08a70ebb61ced5f02fef83af..6ddd0389c8526a9c1b6acf60efab0e16d3310ff3 100644 (file)
@@ -192,6 +192,7 @@ unsigned int new_nkdbufs = 0;
 unsigned int wake_nkdbufs = 0;
 unsigned int write_trace_on_panic = 0;
 unsigned int trace_typefilter = 0;
+boolean_t    trace_serial = FALSE;
 
 /* mach leak logging */
 int log_leaks = 0;
@@ -480,6 +481,11 @@ kernel_bootstrap_thread(void)
 #endif
 
 #if (defined(__i386__) || defined(__x86_64__))
+       if (kdebug_serial) {
+               new_nkdbufs = 1;
+               if (trace_typefilter == 0)
+                       trace_typefilter = 1;
+       }
        if (turn_on_log_leaks && !new_nkdbufs)
                new_nkdbufs = 200000;
        if (trace_typefilter)
index b9a7ae0ebe5a749c2d0e7902d507a0389069eadd..8b7d863b7a8e08f0ef4c867b4ce38e567009775d 100644 (file)
@@ -309,6 +309,12 @@ thread_bootstrap(void)
 #endif /* HYPERVISOR */
 
        thread_template.t_chud = 0;
+
+#if (DEVELOPMENT || DEBUG)
+       thread_template.t_page_creation_throttled_hard = 0;
+       thread_template.t_page_creation_throttled_soft = 0;
+#endif /* DEVELOPMENT || DEBUG */
+       thread_template.t_page_creation_throttled = 0;
        thread_template.t_page_creation_count = 0;
        thread_template.t_page_creation_time = 0;
 
@@ -663,7 +669,7 @@ void
 thread_terminate_enqueue(
        thread_t                thread)
 {
-       KERNEL_DEBUG_CONSTANT(TRACEDBG_CODE(DBG_TRACE_DATA, TRACE_DATA_THREAD_TERMINATE) | DBG_FUNC_NONE, thread->thread_id, 0, 0, 0, 0);
+       KERNEL_DEBUG_CONSTANT(TRACE_DATA_THREAD_TERMINATE | DBG_FUNC_NONE, thread->thread_id, 0, 0, 0, 0);
 
        simple_lock(&thread_terminate_lock);
        enqueue_tail(&thread_terminate_queue, (queue_entry_t)thread);
index a81bc6c8ddf44c2e6b13edb17aa739798edae589..0b5061a33a19a1536d4ece308967f3939c794055 100644 (file)
@@ -411,6 +411,12 @@ struct thread {
                clock_sec_t t_page_creation_time;
                uint32_t    t_page_creation_count;
 
+       uint32_t    t_page_creation_throttled;
+#if (DEVELOPMENT || DEBUG)
+       uint64_t    t_page_creation_throttled_hard;
+       uint64_t    t_page_creation_throttled_soft;
+#endif /* DEVELOPMENT || DEBUG */
+
 #define T_CHUD_MARKED           0x01          /* this thread is marked by CHUD */
 #define T_IN_CHUD               0x02          /* this thread is already in a CHUD handler */
 #define THREAD_PMC_FLAG         0x04          /* Bit in "t_chud" signifying PMC interest */    
index 7d3a98630b2b3186e37fa34c58c87979dd36b128..abd9115623dc8e2253ab069a1d59e9306c4a2c7c 100644 (file)
@@ -393,9 +393,7 @@ __END_DECLS
 #define CPUFAMILY_INTEL_SANDYBRIDGE    0x5490b78c
 #define CPUFAMILY_INTEL_IVYBRIDGE      0x1f65e835
 #define CPUFAMILY_INTEL_HASWELL                0x10b282dc
-#if !defined(XNU_HIDE_SEED)
 #define CPUFAMILY_INTEL_BROADWELL      0x582ed09c
-#endif /* not XNU_HIDE_SEED */
 #define CPUFAMILY_ARM_9                        0xe73283ae
 #define CPUFAMILY_ARM_11               0x8ff620d8
 #define CPUFAMILY_ARM_XSCALE           0x53b005f5
index 402323060ef29451c88902e2580bf64befda16a2..22485828281f6821bd7be4fcecb1fcf08ff6ccf5 100644 (file)
@@ -60,6 +60,7 @@ int           vm_compressor_mode = VM_PAGER_COMPRESSOR_WITH_SWAP;
 int            vm_scale = 16;
 
 
+int            vm_compressor_is_active = 0;
 int            vm_compression_limit = 0;
 
 extern boolean_t vm_swap_up;
@@ -464,6 +465,9 @@ vm_compressor_init(void)
                vm_compressor_swap_init();
        }
 
+       if (COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_SWAPBACKED)
+               vm_compressor_is_active = 1;
+
 #if CONFIG_FREEZE
        memorystatus_freeze_enabled = TRUE;
 #endif /* CONFIG_FREEZE */
@@ -764,9 +768,9 @@ void
 c_seg_free_locked(c_segment_t c_seg)
 {
        int             segno, i;
-       int             pages_populated;
+       int             pages_populated = 0;
        int32_t         *c_buffer = NULL;
-       uint64_t        c_swap_handle;
+       uint64_t        c_swap_handle = 0;
 
        assert(!c_seg->c_on_minorcompact_q);
 
@@ -1017,9 +1021,7 @@ struct {
 } c_seg_major_compact_stats;
 
 
-#define        C_MAJOR_COMPACTION_AGE_APPROPRIATE      30
-#define C_MAJOR_COMPACTION_OLD_ENOUGH          300
-#define C_MAJOR_COMPACTION_SIZE_APPROPRIATE    ((C_SEG_BUFSIZE * 80) / 100)
+#define C_MAJOR_COMPACTION_SIZE_APPROPRIATE    ((C_SEG_BUFSIZE * 90) / 100)
 
 
 boolean_t
@@ -2398,7 +2400,7 @@ static int
 c_compress_page(char *src, c_slot_mapping_t slot_ptr, c_segment_t *current_chead, char *scratch_buf)
 {
        int             c_size;
-       int             c_rounded_size;
+       int             c_rounded_size = 0;
        int             max_csize;
        c_slot_t        cs;
        c_segment_t     c_seg;
index e39ebf9b19bc8cc73cc0747a9a6b1d988e8f1d39..bb506ceddc0a48c4366688a158059f571d7e20e3 100644 (file)
@@ -135,11 +135,15 @@ uint64_t vm_hard_throttle_threshold;
 
 #define NEED_TO_HARD_THROTTLE_THIS_TASK()      (vm_wants_task_throttled(current_task()) ||     \
                                                 (vm_page_free_count < vm_page_throttle_limit && \
-                                                 proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO) >= THROTTLE_LEVEL_THROTTLED))
+                                                 proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO) > THROTTLE_LEVEL_THROTTLED))
 
 
-#define HARD_THROTTLE_DELAY    20000   /* 20000 us == 20 ms */
-#define SOFT_THROTTLE_DELAY    2000    /* 2000 us == 2 ms */
+#define HARD_THROTTLE_DELAY    5000    /* 5000 us == 5 ms */
+#define SOFT_THROTTLE_DELAY    200     /* 200 us == .2 ms */
+
+#define        VM_PAGE_CREATION_THROTTLE_PERIOD_SECS   6
+#define        VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC  20000
+
 
 boolean_t current_thread_aborted(void);
 
@@ -544,8 +548,13 @@ vm_fault_deactivate_behind(
 }
 
 
+#if (DEVELOPMENT || DEBUG)
+uint32_t       vm_page_creation_throttled_hard = 0;
+uint32_t       vm_page_creation_throttled_soft = 0;
+#endif /* DEVELOPMENT || DEBUG */
+
 static int
-vm_page_throttled(void)
+vm_page_throttled(boolean_t page_kept)
 {
         clock_sec_t     elapsed_sec;
         clock_sec_t     tv_sec;
@@ -556,21 +565,31 @@ vm_page_throttled(void)
        if (thread->options & TH_OPT_VMPRIV)
                return (0);
 
-       thread->t_page_creation_count++;
-
-       if (NEED_TO_HARD_THROTTLE_THIS_TASK())
+       if (thread->t_page_creation_throttled) {
+               thread->t_page_creation_throttled = 0;
+               
+               if (page_kept == FALSE)
+                       goto no_throttle;
+       }
+       if (NEED_TO_HARD_THROTTLE_THIS_TASK()) {
+#if (DEVELOPMENT || DEBUG)
+               thread->t_page_creation_throttled_hard++;
+               OSAddAtomic(1, &vm_page_creation_throttled_hard);
+#endif /* DEVELOPMENT || DEBUG */
                return (HARD_THROTTLE_DELAY);
+       }
 
        if ((vm_page_free_count < vm_page_throttle_limit || ((COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) && SWAPPER_NEEDS_TO_UNTHROTTLE())) &&
-           thread->t_page_creation_count > vm_page_creation_throttle) {
+           thread->t_page_creation_count > (VM_PAGE_CREATION_THROTTLE_PERIOD_SECS * VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC)) {
                
                clock_get_system_microtime(&tv_sec, &tv_usec);
 
                elapsed_sec = tv_sec - thread->t_page_creation_time;
 
-               if (elapsed_sec <= 6 || (thread->t_page_creation_count / elapsed_sec) >= (vm_page_creation_throttle / 6)) {
+               if (elapsed_sec <= VM_PAGE_CREATION_THROTTLE_PERIOD_SECS ||
+                   (thread->t_page_creation_count / elapsed_sec) >= VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC) {
 
-                       if (elapsed_sec >= 60) {
+                       if (elapsed_sec >= (3 * VM_PAGE_CREATION_THROTTLE_PERIOD_SECS)) {
                                /*
                                 * we'll reset our stats to give a well behaved app
                                 * that was unlucky enough to accumulate a bunch of pages
@@ -581,22 +600,35 @@ vm_page_throttled(void)
                                 * will remain in the throttled state
                                 */
                                thread->t_page_creation_time = tv_sec;
-                               thread->t_page_creation_count = (vm_page_creation_throttle / 6) * 5;
+                               thread->t_page_creation_count = VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC * (VM_PAGE_CREATION_THROTTLE_PERIOD_SECS - 1);
                        }
                        ++vm_page_throttle_count;
 
-                       if ((COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) && HARD_THROTTLE_LIMIT_REACHED())
+                       thread->t_page_creation_throttled = 1;
+
+                       if ((COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) && HARD_THROTTLE_LIMIT_REACHED()) {
+#if (DEVELOPMENT || DEBUG)
+                               thread->t_page_creation_throttled_hard++;
+                               OSAddAtomic(1, &vm_page_creation_throttled_hard);
+#endif /* DEVELOPMENT || DEBUG */
                                return (HARD_THROTTLE_DELAY);
-                       else
+                       } else {
+#if (DEVELOPMENT || DEBUG)
+                               thread->t_page_creation_throttled_soft++;
+                               OSAddAtomic(1, &vm_page_creation_throttled_soft);
+#endif /* DEVELOPMENT || DEBUG */
                                return (SOFT_THROTTLE_DELAY);
+                       }
                }
                thread->t_page_creation_time = tv_sec;
                thread->t_page_creation_count = 0;
        }
+no_throttle:
+       thread->t_page_creation_count++;
+
        return (0);
 }
 
-
 /*
  * check for various conditions that would
  * prevent us from creating a ZF page...
@@ -606,7 +638,7 @@ vm_page_throttled(void)
  * object == m->object
  */
 static vm_fault_return_t
-vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, boolean_t interruptible_state)
+vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, boolean_t interruptible_state, boolean_t page_throttle)
 {
        int throttle_delay;
 
@@ -647,7 +679,7 @@ vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, boolean_t int
                        return (VM_FAULT_RETRY);
                }
        }
-       if ((throttle_delay = vm_page_throttled())) {
+       if (page_throttle == TRUE && (throttle_delay = vm_page_throttled(FALSE))) {
                /*
                 * we're throttling zero-fills...
                 * treat this as if we couldn't grab a page
@@ -1150,7 +1182,7 @@ vm_fault_page(
                                         * fault cleanup in the case of an error condition
                                         * including resetting the thread_interrupt_level
                                         */
-                                       error = vm_fault_check(object, m, first_m, interruptible_state);
+                                       error = vm_fault_check(object, m, first_m, interruptible_state, (type_of_fault == NULL) ? TRUE : FALSE);
 
                                        if (error != VM_FAULT_SUCCESS)
                                                return (error);
@@ -1560,6 +1592,21 @@ vm_fault_page(
                                        0,
                                        &compressed_count_delta);
 
+                               if (type_of_fault == NULL) {
+                                       int     throttle_delay;
+
+                                       /*
+                                        * we weren't called from vm_fault, so we
+                                        * need to apply page creation throttling
+                                        * do it before we re-acquire any locks
+                                        */
+                                       if (my_fault_type == DBG_COMPRESSOR_FAULT) {
+                                               if ((throttle_delay = vm_page_throttled(TRUE))) {
+                                                       VM_DEBUG_EVENT(vmf_compressordelay, VMF_COMPRESSORDELAY, DBG_FUNC_NONE, throttle_delay, 0, 1, 0);
+                                                       delay(throttle_delay);
+                                               }
+                                       }
+                               }
                                vm_object_lock(object);
                                assert(object->paging_in_progress > 0);
 
@@ -1856,7 +1903,7 @@ dont_look_for_page:
                         * fault cleanup in the case of an error condition
                         * including resetting the thread_interrupt_level
                         */
-                       error = vm_fault_check(object, m, first_m, interruptible_state);
+                       error = vm_fault_check(object, m, first_m, interruptible_state, (type_of_fault == NULL) ? TRUE : FALSE);
 
                        if (error != VM_FAULT_SUCCESS)
                                return (error);
@@ -3885,31 +3932,6 @@ FastPmapEnter:
                         */
                        assert(object_lock_type == OBJECT_LOCK_EXCLUSIVE);
 
-                       if ((throttle_delay = vm_page_throttled())) {
-                               /*
-                                * drop all of our locks...
-                                * wait until the free queue is
-                                * pumped back up and then
-                                * redrive the fault
-                                */
-                               if (object != cur_object)
-                                       vm_object_unlock(cur_object);
-                               vm_object_unlock(object);
-                               vm_map_unlock_read(map);
-                               if (real_map != map)
-                                       vm_map_unlock(real_map);
-
-                               VM_DEBUG_EVENT(vmf_cowdelay, VMF_COWDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
-
-                               delay(throttle_delay);
-
-                               if (!current_thread_aborted() && vm_page_wait((change_wiring) ? 
-                                                THREAD_UNINT :
-                                                THREAD_ABORTSAFE))
-                                       goto RetryFault;
-                               kr = KERN_ABORTED;
-                               goto done;
-                       }
                         /*
                         * If objects match, then
                         * object->copy must not be NULL (else control
@@ -4268,31 +4290,6 @@ FastPmapEnter:
                                        kr = KERN_MEMORY_ERROR;
                                        goto done;
                                }
-                               if ((throttle_delay = vm_page_throttled())) {
-                                       /*
-                                        * drop all of our locks...
-                                        * wait until the free queue is
-                                        * pumped back up and then
-                                        * redrive the fault
-                                        */
-                                       if (object != cur_object)
-                                               vm_object_unlock(cur_object);
-                                       vm_object_unlock(object);
-                                       vm_map_unlock_read(map);
-                                       if (real_map != map)
-                                               vm_map_unlock(real_map);
-
-                                       VM_DEBUG_EVENT(vmf_zfdelay, VMF_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
-
-                                       delay(throttle_delay);
-
-                                       if (!current_thread_aborted() && vm_page_wait((change_wiring) ? 
-                                                        THREAD_UNINT :
-                                                        THREAD_ABORTSAFE))
-                                               goto RetryFault;
-                                       kr = KERN_ABORTED;
-                                       goto done;
-                               }
                                if (vm_backing_store_low) {
                                        /*
                                         * we are protecting the system from
@@ -4829,12 +4826,27 @@ done:
        thread_interrupt_level(interruptible_state);
 
        /*
-        * Only throttle on faults which cause a pagein.
+        * Only I/O throttle on faults which cause a pagein/swapin.
         */
        if ((type_of_fault == DBG_PAGEIND_FAULT) || (type_of_fault == DBG_PAGEINV_FAULT) || (type_of_fault == DBG_COMPRESSOR_SWAPIN_FAULT)) {
                throttle_lowpri_io(1);
-       }
+       } else {
+               if (kr == KERN_SUCCESS && type_of_fault != DBG_CACHE_HIT_FAULT && type_of_fault != DBG_GUARD_FAULT) {
 
+                       if ((throttle_delay = vm_page_throttled(TRUE))) {
+
+                               if (vm_debug_events) {
+                                       if (type_of_fault == DBG_COMPRESSOR_FAULT)
+                                               VM_DEBUG_EVENT(vmf_compressordelay, VMF_COMPRESSORDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
+                                       else if (type_of_fault == DBG_COW_FAULT)
+                                               VM_DEBUG_EVENT(vmf_cowdelay, VMF_COWDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
+                                       else
+                                               VM_DEBUG_EVENT(vmf_zfdelay, VMF_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
+                               }
+                               delay(throttle_delay);
+                       }
+               }
+       }
        KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, 
                              (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
                              ((uint64_t)vaddr >> 32),
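The reworked per-thread page-creation throttle above boils down to a rate check: a thread may create up to VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC pages per second, measured over a VM_PAGE_CREATION_THROTTLE_PERIOD_SECS window, before it is delayed, and the hard 5 ms delay is reserved for tasks already flagged for throttling or for the compressor-backed low-memory case. A condensed, self-contained restatement of that decision (locking, TH_OPT_VMPRIV, the page_kept re-entry path and the DEVELOPMENT-only counters are deliberately omitted):

/* Condensed sketch of the vm_page_throttled() logic above; not a drop-in. */
#define THROTTLE_PERIOD_SECS	6	/* VM_PAGE_CREATION_THROTTLE_PERIOD_SECS */
#define THROTTLE_RATE_PER_SEC	20000	/* VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC */
#define HARD_DELAY_US		5000	/* HARD_THROTTLE_DELAY */
#define SOFT_DELAY_US		200	/* SOFT_THROTTLE_DELAY */

struct pc_state {
	unsigned long	count;		/* pages created in the current window */
	unsigned long	window_start;	/* window start, in seconds */
};

/* Returns a delay in microseconds; 0 means "not throttled". */
static unsigned int
page_creation_delay(struct pc_state *st, unsigned long now_secs,
    int task_over_limit, int memory_is_tight, int compressor_hard_limit)
{
	if (task_over_limit)		/* NEED_TO_HARD_THROTTLE_THIS_TASK() */
		return HARD_DELAY_US;

	if (memory_is_tight &&
	    st->count > THROTTLE_PERIOD_SECS * THROTTLE_RATE_PER_SEC) {
		unsigned long elapsed = now_secs - st->window_start;

		if (elapsed <= THROTTLE_PERIOD_SECS ||
		    st->count / elapsed >= THROTTLE_RATE_PER_SEC) {
			if (elapsed >= 3 * THROTTLE_PERIOD_SECS) {
				/* long-running offender: restart the window one second
				 * short of the limit so it stays throttled unless it
				 * genuinely slows down */
				st->window_start = now_secs;
				st->count = THROTTLE_RATE_PER_SEC * (THROTTLE_PERIOD_SECS - 1);
			}
			return compressor_hard_limit ? HARD_DELAY_US : SOFT_DELAY_US;
		}
		/* creation rate fell below the limit: start a fresh window */
		st->window_start = now_secs;
		st->count = 0;
	}
	st->count++;
	return 0;
}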
index a22b763c17218f25395b275c9bdd1d3d7f1dcf6f..ca11e1bae872bddae4311b2d0dedf3719ecfb5c3 100644 (file)
@@ -6199,6 +6199,7 @@ vm_map_copy_copy(
         */
 
        new_copy = (vm_map_copy_t) zalloc(vm_map_copy_zone);
+       new_copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE;
        *new_copy = *copy;
 
        if (copy->type == VM_MAP_COPY_ENTRY_LIST) {
@@ -6847,6 +6848,7 @@ start_overwrite:
                                /* destroyed after successful copy_overwrite */
                                copy = (vm_map_copy_t) 
                                        zalloc(vm_map_copy_zone);
+                               copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE;
                                vm_map_copy_first_entry(copy) =
                                        vm_map_copy_last_entry(copy) =
                                        vm_map_copy_to_entry(copy);
@@ -7150,6 +7152,7 @@ vm_map_copy_overwrite(
                 * Extract "head_copy" out of "copy".
                 */
                head_copy = (vm_map_copy_t) zalloc(vm_map_copy_zone);
+               head_copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE;
                vm_map_copy_first_entry(head_copy) =
                        vm_map_copy_to_entry(head_copy);
                vm_map_copy_last_entry(head_copy) =
@@ -7191,6 +7194,7 @@ vm_map_copy_overwrite(
                 * Extract "tail_copy" out of "copy".
                 */
                tail_copy = (vm_map_copy_t) zalloc(vm_map_copy_zone);
+               tail_copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE;
                vm_map_copy_first_entry(tail_copy) =
                        vm_map_copy_to_entry(tail_copy);
                vm_map_copy_last_entry(tail_copy) =
@@ -8657,6 +8661,7 @@ vm_map_copyin_common(
         */
 
        copy = (vm_map_copy_t) zalloc(vm_map_copy_zone);
+       copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE;
        vm_map_copy_first_entry(copy) =
                vm_map_copy_last_entry(copy) = vm_map_copy_to_entry(copy);
        copy->type = VM_MAP_COPY_ENTRY_LIST;
@@ -9392,6 +9397,7 @@ vm_map_copy_extract(
         */
 
        copy = (vm_map_copy_t) zalloc(vm_map_copy_zone);
+       copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE;
        vm_map_copy_first_entry(copy) =
                vm_map_copy_last_entry(copy) = vm_map_copy_to_entry(copy);
        copy->type = VM_MAP_COPY_ENTRY_LIST;
@@ -9443,6 +9449,7 @@ vm_map_copyin_object(
         */
 
        copy = (vm_map_copy_t) zalloc(vm_map_copy_zone);
+       copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE;
        copy->type = VM_MAP_COPY_OBJECT;
        copy->cpy_object = object;
        copy->offset = offset;
index 3a2b381f0024ddef20e90bb89ad89c08d3719bd1..288bafba1d7c4e1818a68e369d9bec6712106dd7 100644 (file)
@@ -36,12 +36,23 @@ first_free_is_valid_store( vm_map_t map )
 }
 #endif
 
+boolean_t
+vm_map_store_has_RB_support( struct vm_map_header *hdr )
+{
+       if ((void*)hdr->rb_head_store.rbh_root == (void*)(int)SKIP_RB_TREE) {
+               return FALSE;
+       }
+       return TRUE;
+}
+
 void
 vm_map_store_init( struct vm_map_header *hdr )
 {
        vm_map_store_init_ll( hdr );
 #ifdef VM_MAP_STORE_USE_RB
-       vm_map_store_init_rb( hdr );
+       if (vm_map_store_has_RB_support( hdr )) {
+               vm_map_store_init_rb( hdr );
+       }
 #endif
 }
 
@@ -54,7 +65,12 @@ vm_map_store_lookup_entry(
 #ifdef VM_MAP_STORE_USE_LL
        return (vm_map_store_lookup_entry_ll( map, address, entry ));
 #elif defined VM_MAP_STORE_USE_RB
-       return (vm_map_store_lookup_entry_rb( map, address, entry ));
+       if (vm_map_store_has_RB_support( &map->hdr )) {
+               return (vm_map_store_lookup_entry_rb( map, address, entry ));
+       } else {
+               panic("VM map lookups need RB tree support.\n");
+               return FALSE; /* For compiler warning.*/
+       }
 #endif
 }
 
@@ -81,7 +97,9 @@ void  vm_map_store_copy_insert( vm_map_t map, vm_map_entry_t after_where, vm_map_
 {
        vm_map_store_copy_insert_ll(map, after_where, copy);
 #ifdef VM_MAP_STORE_USE_RB
-       vm_map_store_copy_insert_rb(map, after_where, copy);
+       if (vm_map_store_has_RB_support( &map->hdr )) {
+               vm_map_store_copy_insert_rb(map, after_where, copy);
+       }
 #endif
 }
 
@@ -104,7 +122,9 @@ _vm_map_store_entry_link( struct vm_map_header * mapHdr, vm_map_entry_t after_wh
        assert(entry->vme_start < entry->vme_end);
        vm_map_store_entry_link_ll(mapHdr, after_where, entry);
 #ifdef VM_MAP_STORE_USE_RB
-       vm_map_store_entry_link_rb(mapHdr, after_where, entry);
+       if (vm_map_store_has_RB_support( mapHdr )) {
+               vm_map_store_entry_link_rb(mapHdr, after_where, entry);
+       }
 #endif
 #if MAP_ENTRY_INSERTION_DEBUG
        fastbacktrace(&entry->vme_insertion_bt[0],
@@ -126,7 +146,9 @@ vm_map_store_entry_link( vm_map_t map, vm_map_entry_t after_where, vm_map_entry_
        } else {
                update_first_free_ll(VMEL_map, VMEL_map->first_free);
 #ifdef VM_MAP_STORE_USE_RB
-               update_first_free_rb(VMEL_map, VMEL_map->first_free);
+               if (vm_map_store_has_RB_support( &VMEL_map->hdr )) {
+                       update_first_free_rb(VMEL_map, VMEL_map->first_free);
+               }
 #endif
        }
 }
@@ -136,7 +158,9 @@ _vm_map_store_entry_unlink( struct vm_map_header * mapHdr, vm_map_entry_t entry)
 {
        vm_map_store_entry_unlink_ll(mapHdr, entry);
 #ifdef VM_MAP_STORE_USE_RB
-       vm_map_store_entry_unlink_rb(mapHdr, entry);
+       if (vm_map_store_has_RB_support( mapHdr )) {
+               vm_map_store_entry_unlink_rb(mapHdr, entry);
+       }
 #endif
 }
 
@@ -158,7 +182,9 @@ vm_map_store_entry_unlink( vm_map_t map, vm_map_entry_t entry)
        vm_map_store_update( map, entry, VM_MAP_ENTRY_DELETE);
        update_first_free_ll(VMEU_map, VMEU_first_free);
 #ifdef VM_MAP_STORE_USE_RB
-       update_first_free_rb(VMEU_map, VMEU_first_free);
+       if (vm_map_store_has_RB_support( &VMEU_map->hdr )) {
+               update_first_free_rb(VMEU_map, VMEU_first_free);
+       }
 #endif
 }
 
@@ -168,7 +194,9 @@ vm_map_store_copy_reset( vm_map_copy_t copy,vm_map_entry_t entry)
        int nentries = copy->cpy_hdr.nentries;
        vm_map_store_copy_reset_ll(copy, entry, nentries);
 #ifdef VM_MAP_STORE_USE_RB
-       vm_map_store_copy_reset_rb(copy, entry, nentries);
+       if (vm_map_store_has_RB_support( &copy->c_u.hdr )) {
+               vm_map_store_copy_reset_rb(copy, entry, nentries);
+       }
 #endif
 }
 
@@ -177,6 +205,8 @@ vm_map_store_update_first_free( vm_map_t map, vm_map_entry_t first_free)
 {
        update_first_free_ll(map, first_free);
 #ifdef VM_MAP_STORE_USE_RB
-       update_first_free_rb(map, first_free);
+       if (vm_map_store_has_RB_support( &map->hdr )) {
+               update_first_free_rb(map, first_free);
+       }
 #endif
 }
index dab7746ede3cf4ab1af84fa46fe85ffea6825c1c..b6c12fe19dcf261b9cb2ffe33b42a5f0be8eec6b 100644 (file)
@@ -114,6 +114,8 @@ struct vm_map_store {
        (map)->hint = (value);         \
        MACRO_END
 
+#define SKIP_RB_TREE           0xBAADC0D1
+
 #define VM_MAP_ENTRY_CREATE    1
 #define VM_MAP_ENTRY_DELETE    2
 
@@ -130,6 +132,7 @@ void        vm_map_store_copy_reset( struct vm_map_copy*, struct vm_map_entry*);
 #if MACH_ASSERT
 boolean_t first_free_is_valid_store( struct _vm_map*);
 #endif
+boolean_t vm_map_store_has_RB_support( struct vm_map_header *hdr );
 
 #endif /* _VM_VM_MAP_STORE_H */
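SKIP_RB_TREE is the sentinel that the vm_map.c hunks above plant in a freshly allocated vm_map_copy's rb_head_store.rbh_root, and vm_map_store_has_RB_support() is what lets every store operation in vm_map_store.c fall back to the linked-list path for such headers. Stripped to its essentials, with simplified names (an illustrative sketch, not the kernel structures):

#include <stdbool.h>
#include <stdlib.h>

#define SKIP_SENTINEL	((void *)(unsigned long)0xBAADC0D1)

struct map_header {
	void	*rb_root;	/* stands in for rb_head_store.rbh_root */
	/* linked-list head, entry counts, ... */
};

static bool
has_rb_support(const struct map_header *hdr)
{
	return hdr->rb_root != SKIP_SENTINEL;
}

static struct map_header *
alloc_copy_header(void)
{
	struct map_header *hdr = calloc(1, sizeof(*hdr));

	if (hdr != NULL)
		hdr->rb_root = SKIP_SENTINEL;	/* copy headers opt out of RB upkeep */
	return hdr;
}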
 
index 7d78f66b28722d43bc77aea71a2060c61b341eb4..080ffb5e732cadf5cbecaae6a6ae64e12d7dbfe2 100644 (file)
@@ -1202,7 +1202,6 @@ struct flow_control {
 uint32_t vm_pageout_considered_page = 0;
 uint32_t vm_page_filecache_min = 0;
 
-#define        VM_PAGE_FILECACHE_MIN   50000
 #define ANONS_GRABBED_LIMIT    2
 
 /*
@@ -1664,6 +1663,16 @@ return_from_scan:
                if  (cache_evict_throttle)
                        cache_evict_throttle--;
 
+               /*
+                * don't let the filecache_min fall below 33% of available memory...
+                *
+                * on systems w/o the compressor/swapper, the filecache is always
+                * a very large percentage of the AVAILABLE_NON_COMPRESSED_MEMORY
+                * since most (if not all) of the anonymous pages are in the
+                * throttled queue (which isn't counted as available) which
+                * effectively disables this filter
+                */
+               vm_page_filecache_min = (AVAILABLE_NON_COMPRESSED_MEMORY / 3);
 
                exceeded_burst_throttle = FALSE;
                /*
@@ -1961,6 +1970,15 @@ consider_inactive:
                                        page_prev_state = PAGE_STATE_INACTIVE;
                                        anons_grabbed = 0;
 
+                                       if (vm_page_pageable_external_count < vm_page_filecache_min) {
+                                               if ((++reactivated_this_call % 100))
+                                                       goto must_activate_page;
+                                               /*
+                                                * steal 1% of the file backed pages even if
+                                                * we are under the limit that has been set
+                                                * for a healthy filecache
+                                                */
+                                       }
                                        break;
                                }
                        }
@@ -2407,6 +2425,7 @@ reactivate_page:
                                        vm_page_deactivate(m);
                                        vm_pageout_inactive_deactivated++;
                                } else {
+must_activate_page:
                                        /*
                                         * The page was/is being used, so put back on active list.
                                         */
@@ -2767,7 +2786,6 @@ vm_page_free_reserve(
                vm_page_free_target = vm_page_free_min + 5;
 
        vm_page_throttle_limit = vm_page_free_target - (vm_page_free_target / 3);
-       vm_page_creation_throttle = vm_page_free_target * 3;
 }
 
 /*
@@ -3763,11 +3781,6 @@ void     vm_pageout_reinit_tuneables(void);
 void
 vm_pageout_reinit_tuneables(void)
 {
-       vm_page_filecache_min = (uint32_t) (max_mem / PAGE_SIZE) / 15;
-
-       if (vm_page_filecache_min < VM_PAGE_FILECACHE_MIN)
-               vm_page_filecache_min = VM_PAGE_FILECACHE_MIN;
-
        vm_compressor_minorcompact_threshold_divisor = 18;
        vm_compressor_majorcompact_threshold_divisor = 22;
        vm_compressor_unthrottle_threshold_divisor = 32;
@@ -3847,12 +3860,6 @@ vm_pageout(void)
        if (vm_pageout_burst_inactive_throttle == 0)
                vm_pageout_burst_inactive_throttle = VM_PAGEOUT_BURST_INACTIVE_THROTTLE;
 
-#if !CONFIG_JETSAM
-       vm_page_filecache_min = (uint32_t) (max_mem / PAGE_SIZE) / 20;
-       if (vm_page_filecache_min < VM_PAGE_FILECACHE_MIN)
-               vm_page_filecache_min = VM_PAGE_FILECACHE_MIN;
-#endif
-
        /*
         * Set kernel task to low backing store privileged 
         * status
@@ -4314,11 +4321,10 @@ upl_set_decmp_info(upl_t upl, upl_t src_upl)
         }
         src_upl->decmp_io_upl = (void *)upl;
         src_upl->ref_count++;
-       upl_unlock(src_upl);
 
         upl->flags |= UPL_DECMP_REAL_IO;
         upl->decmp_io_upl = (void *)src_upl;
-
+       upl_unlock(src_upl);
 }
 #endif /* CONFIG_IOSCHED */  
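Two of the changes above interact: vm_page_filecache_min is now recomputed on every pageout pass as one third of AVAILABLE_NON_COMPRESSED_MEMORY, and when the pageable external (file-backed) page count falls below that floor the scan reactivates 99 of every 100 candidate file pages but still evicts the 100th, per the "steal 1%" comment. The pre-increment/modulo branch is easy to misread, so here is a standalone, runnable illustration of just that counting pattern (not kernel code):

#include <stdio.h>

int main(void)
{
	unsigned int reactivated_this_call = 0;
	int reactivated = 0, stolen = 0;

	for (int i = 0; i < 1000; i++) {
		if ((++reactivated_this_call % 100)) {	/* true for 99 of every 100 pages */
			reactivated++;			/* would go back on the active queue */
			continue;
		}
		stolen++;				/* every 100th page is still reclaimed */
	}
	printf("reactivated=%d stolen=%d\n", reactivated, stolen);	/* 990 / 10 */
	return 0;
}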
 
index 513877c18947d4448c8670667d8608c6e64b8be4..9d81f507048eaf5d07f07b0a39adc7c19e42a4e1 100644 (file)
@@ -360,7 +360,6 @@ ppnum_t             max_valid_low_ppnum = 0xffffffff;
 unsigned int   vm_page_free_target = 0;
 unsigned int   vm_page_free_min = 0;
 unsigned int   vm_page_throttle_limit = 0;
-uint32_t       vm_page_creation_throttle = 0;
 unsigned int   vm_page_inactive_target = 0;
 unsigned int   vm_page_anonymous_min = 0;
 unsigned int   vm_page_inactive_min = 0;
@@ -5122,7 +5121,6 @@ hibernate_flush_queue(queue_head_t *q, int qcount)
 
                                        goto reenter_pg_on_q;
                                }
-                               vm_pageout_scan_wants_object = m_object;
 
                                vm_page_unlock_queues();
                                mutex_pause(try_failed_count++);
@@ -5132,7 +5130,6 @@ hibernate_flush_queue(queue_head_t *q, int qcount)
                                continue;
                        } else {
                                l_object = m_object;
-                               vm_pageout_scan_wants_object = VM_OBJECT_NULL;
                        }
                }
                if ( !m_object->alive || m->encrypted_cleaning || m->cleaning || m->laundry || m->busy || m->absent || m->error) {
@@ -5198,7 +5195,6 @@ hibernate_flush_queue(queue_head_t *q, int qcount)
                                vm_object_unlock(l_object);
                                l_object = NULL;
                        }
-                       vm_pageout_scan_wants_object = VM_OBJECT_NULL;
 
                        while (retval == 0) {
 
@@ -5271,7 +5267,6 @@ next_pg:
                vm_object_unlock(l_object);
                l_object = NULL;
        }
-       vm_pageout_scan_wants_object = VM_OBJECT_NULL;
 
        vm_page_unlock_queues();
 
index 6411418061d88e562eae15aeee01d98e3d2bc7a5..66a4dd7ac8e86b49671827af5c931aa60269feeb 100644 (file)
@@ -74,6 +74,60 @@ extern int _bcopystr(const void *, void *, vm_size_t, vm_size_t *);
 #define COPYINPHYS     3       /* from user virtual to kernel physical */
 #define COPYOUTPHYS    4       /* from kernel physical to user virtual */
 
+#if DEVELOPMENT
+typedef struct {
+       uint64_t        timestamp;
+       thread_t        thread;
+       uintptr_t       cr4;
+       uint8_t         cpuid;
+       uint8_t         smap_state;
+       uint8_t         copyio_active;
+} smaplog_entry_t;
+
+#define SMAPLOG_BUFFER_SIZE (50)
+static smaplog_entry_t smaplog_cbuf[SMAPLOG_BUFFER_SIZE];
+static uint32_t                smaplog_head = 0;
+
+static void
+smaplog_add_entry(boolean_t enabling)
+{
+       uint32_t index = 0;
+       thread_t thread = current_thread();
+
+       do {
+               index = smaplog_head;
+       } while (!OSCompareAndSwap(index, (index + 1) % SMAPLOG_BUFFER_SIZE, &smaplog_head));
+
+       assert(index < SMAPLOG_BUFFER_SIZE);
+       assert(smaplog_head < SMAPLOG_BUFFER_SIZE);
+       assert(thread);
+
+       smaplog_cbuf[index].timestamp = mach_absolute_time();
+       smaplog_cbuf[index].thread = thread;
+       smaplog_cbuf[index].cpuid = cpu_number();
+       smaplog_cbuf[index].cr4 = get_cr4();
+       smaplog_cbuf[index].smap_state = enabling;
+       smaplog_cbuf[index].copyio_active = (thread->machine.specFlags & CopyIOActive) ? 1 : 0;
+}
+#endif /* DEVELOPMENT */
+
+extern boolean_t pmap_smap_enabled;
+static inline void user_access_enable(void) {
+       if (pmap_smap_enabled) {
+               stac();
+#if DEVELOPMENT
+               smaplog_add_entry(TRUE);
+#endif
+       }
+}
+static inline void user_access_disable(void) {
+       if (pmap_smap_enabled) {
+               clac();
+#if DEVELOPMENT
+               smaplog_add_entry(FALSE);
+#endif
+       }
+}
 
 static int
 copyio(int copy_type, user_addr_t user_addr, char *kernel_addr,
@@ -123,6 +177,7 @@ copyio(int copy_type, user_addr_t user_addr, char *kernel_addr,
         */
        recursive_CopyIOActive = thread->machine.specFlags & CopyIOActive;
        thread->machine.specFlags |= CopyIOActive;
+       user_access_enable();
        if (no_shared_cr3) {
                istate = ml_set_interrupts_enabled(FALSE);
                if (get_cr3_base() != pmap->pm_cr3)
@@ -211,6 +266,7 @@ copyio(int copy_type, user_addr_t user_addr, char *kernel_addr,
                break;
        }
 
+       user_access_disable();
        if (!recursive_CopyIOActive) {
                thread->machine.specFlags &= ~CopyIOActive;
        }
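The DEVELOPMENT-only smaplog above claims ring-buffer slots without a lock: each logger snapshots the head index and advances it with a compare-and-swap, retrying if another CPU won the race. The same claim loop in isolation, rewritten with C11 atomics in place of libkern's OSCompareAndSwap (an illustrative sketch, not kernel code):

#include <stdatomic.h>

#define LOG_SIZE 50
static _Atomic unsigned int log_head;

/* Returns the index of a slot the caller now owns; concurrent callers get
 * distinct indices until the buffer wraps and slots are reused. */
static unsigned int
claim_slot(void)
{
	unsigned int index;

	do {
		index = atomic_load(&log_head);
	} while (!atomic_compare_exchange_weak(&log_head, &index,
	    (index + 1) % LOG_SIZE));

	return index;
}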
index 8e1b559025c4e5ef95448b0070906cd573ad0d51..51b1b3348485e964ae2c34acffa21e73e9a5787a 100644 (file)
@@ -320,6 +320,13 @@ pmap_cpu_init(void)
                        pmap_smep_enabled = TRUE;
                }
        }
+       if (cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_SMAP) {
+               boolean_t nsmap;
+               if (!PE_parse_boot_argn("-pmap_smap_disable", &nsmap, sizeof(nsmap))) {
+                       set_cr4(get_cr4() | CR4_SMAP);
+                       pmap_smap_enabled = TRUE;
+               }
+       }
 
        if (cdp->cpu_fixed_pmcs_enabled) {
                boolean_t enable = TRUE;
@@ -448,6 +455,8 @@ pmap_bootstrap(
 
        if (pmap_smep_enabled)
                printf("PMAP: Supervisor Mode Execute Protection enabled\n");
+       if (pmap_smap_enabled)
+               printf("PMAP: Supervisor Mode Access Protection enabled\n");
 
 #if    DEBUG
        printf("Stack canary: 0x%lx\n", __stack_chk_guard[0]);
index d79e4994a2b4fbbceecfa9e766865fd26ebf764b..ad6430635027355308985d39290fdd9cf27feb40 100644 (file)
@@ -175,7 +175,8 @@ typedef struct boot_args {
     uint32_t    pciConfigSpaceEndBusNumber;
     uint32_t   csrActiveConfig;
     uint32_t   csrPendingConfig;
-    uint32_t    __reserved4[728];
+    uint32_t    boot_SMC_plimit;
+    uint32_t    __reserved4[727];
 
 } boot_args;
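boot_SMC_plimit is carved out of the __reserved4 tail (728 entries down to 727), so the boot_args structure keeps its overall size, presumably so bootloader and kernel stay in agreement about the layout. A hypothetical compile-time guard (not part of the commit) that states the invariant:

#include <stdint.h>

struct tail_before { uint32_t __reserved4[728]; };
struct tail_after  { uint32_t boot_SMC_plimit; uint32_t __reserved4[727]; };

_Static_assert(sizeof(struct tail_before) == sizeof(struct tail_after),
    "carving boot_SMC_plimit out of __reserved4 must not change boot_args' size");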
 
index ad892f2aee065d935babf33ce2f6e64409d3bf34..8d1e8f460fa733a885f8f53e6965811bf3562e1a 100644 (file)
@@ -900,7 +900,7 @@ int access_chmod_fchmod_test( void * the_argp )
 
        char *          my_pathp = NULL;
 
-       uid_t           euid,ruid;
+       uid_t           ruid;
        struct stat     my_sb;
 
        FILE *          file_handle;
@@ -987,10 +987,13 @@ int access_chmod_fchmod_test( void * the_argp )
        file_handle = fopen(FILE_NOTME, "w");
        fclose(file_handle);
 
-       /* Currently running as root (through setreuid manipulation), switch to running as the current user. */
-       euid = geteuid();
+       /* Currently running as root (through settid manipulation), switch to running as the current user. */
        ruid = getuid();
-       setreuid(ruid, ruid);
+       my_err = syscall(SYS_settid, ruid, KAUTH_GID_NONE);
+       if (my_err != 0) {
+               printf("Failed to settid to non-root with error %d:%s\n", errno, strerror(errno));
+               goto test_failed_exit;
+       }
 
        /* Create a file that the current user owns  */
        file_handle = fopen(FILE_ME, "w");
@@ -1033,8 +1036,11 @@ int access_chmod_fchmod_test( void * the_argp )
        }
 
        /* Reset to running as root */
-       setreuid(ruid, euid);
-
+       my_err = syscall(SYS_settid, KAUTH_UID_NONE, KAUTH_GID_NONE);
+       if (my_err != 0) {
+               printf("Failed to revert to root using settid with error %d:%s\n", errno, strerror(errno));
+               goto test_failed_exit;
+       }
        if(error_occurred == 1) {
                goto test_failed_exit;
        }
@@ -5908,7 +5914,7 @@ int faccessat_fchmodat_fchmod_test( void * the_argp )
        char *          my_namep = NULL;
        char *          my_pathp = NULL;
 
-       uid_t           euid,ruid;
+       uid_t           ruid;
        struct stat     my_sb;
 
        FILE *          file_handle;
@@ -6044,10 +6050,13 @@ int faccessat_fchmodat_fchmod_test( void * the_argp )
        file_handle = fopen(FILE_NOTME, "w");
        fclose(file_handle);
 
-       /* Currently running as root (through setreuid manipulation), switch to running as the current user. */
-       euid = geteuid();
+       /* Currently running as root (through settid manipulation), switch to running as the current user. */
        ruid = getuid();
-       setreuid(ruid, ruid);
+       my_err = syscall(SYS_settid, ruid, KAUTH_GID_NONE);
+       if (my_err != 0) {
+               printf("Failed to settid to non-root with error %d:%s\n", errno, strerror(errno));
+               goto test_failed_exit;
+       }
 
        /* Create a file that the current user owns  */
        file_handle = fopen(FILE_ME, "w");
@@ -6090,7 +6099,11 @@ int faccessat_fchmodat_fchmod_test( void * the_argp )
        }
 
        /* Reset to running as root */
-       setreuid(ruid, euid);
+       my_err = syscall(SYS_settid, KAUTH_UID_NONE, KAUTH_GID_NONE);
+       if (my_err != 0) {
+               printf("Failed to settid revert to root with error %d:%s\n", errno, strerror(errno));
+               goto test_failed_exit;
+       }
 
        if(error_occurred == 1) {
                goto test_failed_exit;