git.saurik.com Git - apple/xnu.git/commitdiff
xnu-2782.20.48 (tags: os-x-10103, v2782.20.48)
author    Apple <opensource@apple.com>   Fri, 25 Sep 2015 15:59:39 +0000 (15:59 +0000)
committer Apple <opensource@apple.com>   Fri, 25 Sep 2015 15:59:39 +0000 (15:59 +0000)
83 files changed:
bsd/conf/files
bsd/dev/dtrace/dtrace.c
bsd/hfs/hfs.h
bsd/hfs/hfs_cnode.c
bsd/hfs/hfs_fsctl.h
bsd/hfs/hfs_fsinfo.c [new file with mode: 0644]
bsd/hfs/hfs_readwrite.c
bsd/hfs/hfs_vnops.c
bsd/hfs/hfscommon/Catalog/FileIDsServices.c
bsd/hfs/hfscommon/Misc/VolumeAllocation.c
bsd/hfs/hfscommon/headers/FileMgrInternal.h
bsd/kern/bsd_init.c
bsd/kern/kdebug.c
bsd/kern/kern_control.c
bsd/kern/kern_event.c
bsd/kern/kern_exec.c
bsd/kern/kern_exit.c
bsd/kern/kern_prot.c
bsd/kern/kern_sysctl.c
bsd/kern/mach_loader.c
bsd/kern/mach_loader.h
bsd/kern/makekdebugevents.py [new file with mode: 0755]
bsd/kern/proc_info.c
bsd/kern/sys_generic.c
bsd/kern/trace.codes
bsd/kern/uipc_socket.c
bsd/man/man2/kqueue.2
bsd/miscfs/specfs/spec_vnops.c
bsd/net/if_bridge.c
bsd/netinet/in_systm.h
bsd/netinet/ip_icmp.c
bsd/netinet/ip_input.c
bsd/netinet/tcp_cubic.c
bsd/netinet/tcp_debug.h
bsd/netinet6/in6_proto.c
bsd/netinet6/ip6_input.c
bsd/nfs/nfs_bio.c
bsd/nfs/nfs_vfsops.c
bsd/sys/Makefile
bsd/sys/dtrace.h
bsd/sys/dtrace_impl.h
bsd/sys/event.h
bsd/sys/kdebug.h
bsd/vfs/vfs_fsevents.c
bsd/vfs/vfs_lookup.c
bsd/vfs/vfs_syscalls.c
config/MasterVersion
config/Private.exports
libsyscall/mach/.gitignore [new file with mode: 0644]
osfmk/atm/atm.c
osfmk/device/device_init.c
osfmk/i386/AT386/model_dep.c
osfmk/i386/acpi.c
osfmk/i386/cpuid.c
osfmk/i386/cpuid.h
osfmk/i386/panic_hooks.c
osfmk/i386/panic_hooks.h
osfmk/i386/proc_reg.h
osfmk/i386/trap.c
osfmk/i386/trap.h
osfmk/ipc/mach_debug.c
osfmk/kern/bsd_kern.c
osfmk/kern/debug.c
osfmk/kern/debug.h
osfmk/kern/hv_support.c
osfmk/kern/hv_support.h
osfmk/kern/sfi.c
osfmk/kern/sfi.h
osfmk/kern/startup.c
osfmk/kern/thread.c
osfmk/kern/thread.h
osfmk/mach/machine.h
osfmk/vm/vm_compressor.c
osfmk/vm/vm_fault.c
osfmk/vm/vm_map.c
osfmk/vm/vm_map_store.c
osfmk/vm/vm_map_store.h
osfmk/vm/vm_pageout.c
osfmk/vm/vm_resident.c
osfmk/x86_64/copyio.c
osfmk/x86_64/pmap.c
pexpert/pexpert/i386/boot.h
tools/tests/xnu_quick_test/tests.c

diff --git a/bsd/conf/files b/bsd/conf/files
index 54b4ef14d2cf5e85e963d0c761e0df4c643b0efb..d4ce218f8b52406760d772b9143d64f203dbd94d 100644 (file)
@@ -381,6 +381,7 @@ bsd/hfs/hfs_cnode.c                         optional hfs
 bsd/hfs/hfs_encodinghint.c                     standard
 bsd/hfs/hfs_encodings.c                                standard
 bsd/hfs/hfs_endian.c                           optional hfs
+bsd/hfs/hfs_fsinfo.c                           optional hfs
 bsd/hfs/hfs_hotfiles.c                         optional hfs
 bsd/hfs/hfs_link.c                             optional hfs
 bsd/hfs/hfs_lookup.c                           optional hfs
diff --git a/bsd/dev/dtrace/dtrace.c b/bsd/dev/dtrace/dtrace.c
index 25f6d7c8e4d5156f6da257955b7a8bc80540eae0..dd02ad5026d4b620eec8e57ce1ad157ce37152ca 100644 (file)
@@ -20,7 +20,8 @@
  */
 
 /*
- * Portions copyright (c) 2011, Joyent, Inc. All rights reserved.
+ * Portions Copyright (c) 2011, Joyent, Inc. All rights reserved.
+ * Portions Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 /*
@@ -2583,9 +2584,10 @@ dtrace_speculation_commit(dtrace_state_t *state, processorid_t cpu,
 {
        dtrace_speculation_t *spec;
        dtrace_buffer_t *src, *dest;
-       uintptr_t daddr, saddr, dlimit;
+       uintptr_t daddr, saddr, dlimit, slimit;
        dtrace_speculation_state_t current,  new = DTRACESPEC_INACTIVE;
        intptr_t offs;
+       uint64_t timestamp;
 
        if (which == 0)
                return;
@@ -2661,7 +2663,38 @@ dtrace_speculation_commit(dtrace_state_t *state, processorid_t cpu,
        }
 
        /*
-        * We have the space; copy the buffer across.  (Note that this is a
+        * We have sufficient space to copy the speculative buffer into the
+        * primary buffer.  First, modify the speculative buffer, filling
+        * in the timestamp of all entries with the current time.  The data
+        * must have the commit() time rather than the time it was traced,
+        * so that all entries in the primary buffer are in timestamp order.
+        */
+       timestamp = dtrace_gethrtime();
+       saddr = (uintptr_t)src->dtb_tomax;
+       slimit = saddr + src->dtb_offset;
+       while (saddr < slimit) {
+               size_t size;
+               dtrace_rechdr_t *dtrh = (dtrace_rechdr_t *)saddr;
+
+               if (dtrh->dtrh_epid == DTRACE_EPIDNONE) {
+                       saddr += sizeof (dtrace_epid_t);
+                       continue;
+               }
+
+               ASSERT(dtrh->dtrh_epid <= ((dtrace_epid_t) state->dts_necbs));
+               size = state->dts_ecbs[dtrh->dtrh_epid - 1]->dte_size;
+
+               ASSERT(saddr + size <= slimit);
+               ASSERT(size >= sizeof(dtrace_rechdr_t));
+               ASSERT(DTRACE_RECORD_LOAD_TIMESTAMP(dtrh) == UINT64_MAX);
+
+               DTRACE_RECORD_STORE_TIMESTAMP(dtrh, timestamp);
+
+               saddr += size;
+       }
+
+       /*
+        * Copy the buffer across.  (Note that this is a
         * highly subobtimal bcopy(); in the unlikely event that this becomes
         * a serious performance issue, a high-performance DTrace-specific
         * bcopy() should obviously be invented.)
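
The dtrace_rechdr_t type and the DTRACE_RECORD_STORE_TIMESTAMP / DTRACE_RECORD_LOAD_TIMESTAMP macros used in this hunk come from the bsd/sys/dtrace.h side of the commit, which is not reproduced in this excerpt. In the upstream illumos/Delphix change they look roughly like the sketch below; the xnu copy may differ in detail.

typedef struct dtrace_rechdr {
        dtrace_epid_t dtrh_epid;           /* enabled probe ID */
        uint32_t      dtrh_timestamp_hi;   /* high bits of hrtime_t */
        uint32_t      dtrh_timestamp_lo;   /* low bits of hrtime_t */
} dtrace_rechdr_t;

#define DTRACE_RECORD_LOAD_TIMESTAMP(dtrh)              \
        ((dtrh)->dtrh_timestamp_lo +                    \
        ((uint64_t)(dtrh)->dtrh_timestamp_hi << 32))

#define DTRACE_RECORD_STORE_TIMESTAMP(dtrh, hrtime) {   \
        (dtrh)->dtrh_timestamp_lo = (uint32_t)hrtime;   \
        (dtrh)->dtrh_timestamp_hi = hrtime >> 32;       \
}

Splitting the 64-bit timestamp into two 32-bit halves keeps every field of the record header 4 bytes wide, so the header still only needs dtrace_epid_t alignment even though each record now carries a timestamp that is rewritten to the commit time when a speculation is committed.
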
@@ -6119,8 +6152,23 @@ __dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1,
                tomax = buf->dtb_tomax;
                ASSERT(tomax != NULL);
 
-               if (ecb->dte_size != 0)
-                       DTRACE_STORE(uint32_t, tomax, offs, ecb->dte_epid);
+               /*
+                * Build and store the record header corresponding to the ECB.
+                */
+               if (ecb->dte_size != 0) {
+                       dtrace_rechdr_t dtrh;
+
+                       if (!(mstate.dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
+                               mstate.dtms_timestamp = dtrace_gethrtime();
+                               mstate.dtms_present |= DTRACE_MSTATE_TIMESTAMP;
+                       }
+
+                       ASSERT(ecb->dte_size >= sizeof(dtrace_rechdr_t));
+
+                       dtrh.dtrh_epid = ecb->dte_epid;
+                       DTRACE_RECORD_STORE_TIMESTAMP(&dtrh, mstate.dtms_timestamp);
+                       DTRACE_STORE(dtrace_rechdr_t, tomax, offs, dtrh);
+               }
 
                mstate.dtms_epid = ecb->dte_epid;
                mstate.dtms_present |= DTRACE_MSTATE_EPID;
@@ -6268,7 +6316,9 @@ __dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1,
                                continue;
 
                        switch (act->dta_kind) {
-                       case DTRACEACT_SPECULATE:
+                       case DTRACEACT_SPECULATE: {
+                               dtrace_rechdr_t *dtrh = NULL;
+
                                ASSERT(buf == &state->dts_buffer[cpuid]);
                                buf = dtrace_speculation_buffer(state,
                                    cpuid, val);
@@ -6291,9 +6341,23 @@ __dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1,
                                ASSERT(tomax != NULL);
 
-                               if (ecb->dte_size != 0)
-                                       DTRACE_STORE(uint32_t, tomax, offs,
-                                           ecb->dte_epid);
-                               continue;
+                               if (ecb->dte_size == 0)
+                                       continue;
+
+                               ASSERT(ecb->dte_size >= sizeof(dtrace_rechdr_t));
+                               dtrh = ((void *)(tomax + offs));
+                               dtrh->dtrh_epid = ecb->dte_epid;
+
+                               /*
+                                * When the speculation is committed, all of
+                                * the records in the speculative buffer will
+                                * have their timestamps set to the commit
+                                * time.  Until then, it is set to a sentinel
+                                * value, for debuggability.
+                                */
+                               DTRACE_RECORD_STORE_TIMESTAMP(dtrh, UINT64_MAX);
+
+                               continue;
+                       }
 
                        case DTRACEACT_CHILL:
                                if (dtrace_priv_kernel_destructive(state))
@@ -9559,9 +9623,9 @@ dtrace_ecb_add(dtrace_state_t *state, dtrace_probe_t *probe)
 
        /*
         * The default size is the size of the default action: recording
-        * the epid.
+        * the header.
         */
-       ecb->dte_size = ecb->dte_needed = sizeof (dtrace_epid_t);
+       ecb->dte_size = ecb->dte_needed = sizeof (dtrace_rechdr_t);
        ecb->dte_alignment = sizeof (dtrace_epid_t);
 
        epid = state->dts_epid++;
@@ -9661,122 +9725,85 @@ dtrace_ecb_enable(dtrace_ecb_t *ecb)
 static void
 dtrace_ecb_resize(dtrace_ecb_t *ecb)
 {
-       uint32_t maxalign = sizeof (dtrace_epid_t);
-       uint32_t align = sizeof (uint8_t), offs, diff;
        dtrace_action_t *act;
-       int wastuple = 0;
+       uint32_t curneeded = UINT32_MAX;
        uint32_t aggbase = UINT32_MAX;
-       dtrace_state_t *state = ecb->dte_state;
 
        /*
-        * If we record anything, we always record the epid.  (And we always
-        * record it first.)
+        * If we record anything, we always record the dtrace_rechdr_t.  (And
+        * we always record it first.)
         */
-       offs = sizeof (dtrace_epid_t);
-       ecb->dte_size = ecb->dte_needed = sizeof (dtrace_epid_t);
+       ecb->dte_size = sizeof (dtrace_rechdr_t);
+       ecb->dte_alignment = sizeof (dtrace_epid_t);
 
        for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
                dtrace_recdesc_t *rec = &act->dta_rec;
+               ASSERT(rec->dtrd_size > 0 || rec->dtrd_alignment == 1);
 
-               if ((align = rec->dtrd_alignment) > maxalign)
-                       maxalign = align;
-
-               if (!wastuple && act->dta_intuple) {
-                       /*
-                        * This is the first record in a tuple.  Align the
-                        * offset to be at offset 4 in an 8-byte aligned
-                        * block.
-                        */
-                       diff = offs + sizeof (dtrace_aggid_t);
-
-                       if ((diff = (diff & (sizeof (uint64_t) - 1))))
-                               offs += sizeof (uint64_t) - diff;
-
-                       aggbase = offs - sizeof (dtrace_aggid_t);
-                       ASSERT(!(aggbase & (sizeof (uint64_t) - 1)));
-               }
-
-               /*LINTED*/
-               if (rec->dtrd_size != 0 && (diff = (offs & (align - 1)))) {
-                       /*
-                        * The current offset is not properly aligned; align it.
-                        */
-                       offs += align - diff;
-               }
-
-               rec->dtrd_offset = offs;
-
-               if (offs + rec->dtrd_size > ecb->dte_needed) {
-                       ecb->dte_needed = offs + rec->dtrd_size;
-
-                       if (ecb->dte_needed > state->dts_needed)
-                               state->dts_needed = ecb->dte_needed;
-               }
+               ecb->dte_alignment = MAX(ecb->dte_alignment, rec->dtrd_alignment);
 
                if (DTRACEACT_ISAGG(act->dta_kind)) {
                        dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
-                       dtrace_action_t *first = agg->dtag_first, *prev;
 
-                       ASSERT(rec->dtrd_size != 0 && first != NULL);
-                       ASSERT(wastuple);
+                       ASSERT(rec->dtrd_size != 0);
+                       ASSERT(agg->dtag_first != NULL);
+                       ASSERT(act->dta_prev->dta_intuple);
                        ASSERT(aggbase != UINT32_MAX);
+                       ASSERT(curneeded != UINT32_MAX);
 
                        agg->dtag_base = aggbase;
 
-                       while ((prev = first->dta_prev) != NULL &&
-                           DTRACEACT_ISAGG(prev->dta_kind)) {
-                               agg = (dtrace_aggregation_t *)prev;
-                               first = agg->dtag_first;
-                       }
+                       curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);
+                       rec->dtrd_offset = curneeded;
+                       curneeded += rec->dtrd_size;
+                       ecb->dte_needed = MAX(ecb->dte_needed, curneeded);
 
-                       if (prev != NULL) {
-                               offs = prev->dta_rec.dtrd_offset +
-                                   prev->dta_rec.dtrd_size;
-                       } else {
-                               offs = sizeof (dtrace_epid_t);
+                       aggbase = UINT32_MAX;
+                       curneeded = UINT32_MAX;
+               } else if (act->dta_intuple) {
+                       if (curneeded == UINT32_MAX) {
+                               /*
+                                * This is the first record in a tuple.  Align
+                                * curneeded to be at offset 4 in an 8-byte
+                                * aligned block.
+                                */
+                               ASSERT(act->dta_prev == NULL || !act->dta_prev->dta_intuple);
+                               ASSERT(aggbase == UINT32_MAX);
+
+                               curneeded = P2PHASEUP(ecb->dte_size,
+                                   sizeof (uint64_t), sizeof (dtrace_aggid_t));
+
+                               aggbase = curneeded - sizeof (dtrace_aggid_t);
+                               ASSERT(IS_P2ALIGNED(aggbase,
+                                   sizeof (uint64_t)));
                        }
-                       wastuple = 0;
-               } else {
-                       if (!act->dta_intuple)
-                               ecb->dte_size = offs + rec->dtrd_size;
 
-                       offs += rec->dtrd_size;
+                       curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);
+                       rec->dtrd_offset = curneeded;
+                       curneeded += rec->dtrd_size;
+               } else {
+                       /* tuples must be followed by an aggregation */
+                       ASSERT(act->dta_prev == NULL || !act->dta_prev->dta_intuple);
+                       ecb->dte_size = P2ROUNDUP(ecb->dte_size, rec->dtrd_alignment);
+                       rec->dtrd_offset = ecb->dte_size;
+                       ecb->dte_size += rec->dtrd_size;
+                       ecb->dte_needed = MAX(ecb->dte_needed, ecb->dte_size);
                }
-
-               wastuple = act->dta_intuple;
        }
 
        if ((act = ecb->dte_action) != NULL &&
            !(act->dta_kind == DTRACEACT_SPECULATE && act->dta_next == NULL) &&
-           ecb->dte_size == sizeof (dtrace_epid_t)) {
+           ecb->dte_size == sizeof (dtrace_rechdr_t)) {
                /*
-                * If the size is still sizeof (dtrace_epid_t), then all
+                * If the size is still sizeof (dtrace_rechdr_t), then all
                 * actions store no data; set the size to 0.
                 */
-               ecb->dte_alignment = maxalign;
                ecb->dte_size = 0;
-
-               /*
-                * If the needed space is still sizeof (dtrace_epid_t), then
-                * all actions need no additional space; set the needed
-                * size to 0.
-                */
-               if (ecb->dte_needed == sizeof (dtrace_epid_t))
-                       ecb->dte_needed = 0;
-
-               return;
        }
 
-       /*
-        * Set our alignment, and make sure that the dte_size and dte_needed
-        * are aligned to the size of an EPID.
-        */
-       ecb->dte_alignment = maxalign;
-       ecb->dte_size = (ecb->dte_size + (sizeof (dtrace_epid_t) - 1)) &
-           ~(sizeof (dtrace_epid_t) - 1);
-       ecb->dte_needed = (ecb->dte_needed + (sizeof (dtrace_epid_t) - 1)) &
-           ~(sizeof (dtrace_epid_t) - 1);
-       ASSERT(ecb->dte_size <= ecb->dte_needed);
+       ecb->dte_size = P2ROUNDUP(ecb->dte_size, sizeof (dtrace_epid_t));
+       ecb->dte_needed = P2ROUNDUP(ecb->dte_needed, (sizeof (dtrace_epid_t)));
+       ecb->dte_state->dts_needed = MAX(ecb->dte_state->dts_needed, ecb->dte_needed);
 }
 
 static dtrace_action_t *
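
The rewritten dtrace_ecb_resize() above relies on the Solaris-derived power-of-two helpers. Their conventional definitions are sketched below for reference (the actual definitions live in the DTrace headers, not in this diff; align must be a power of two in all of them).

/* Round x up to the next multiple of align. */
#define P2ROUNDUP(x, align)         (-(-(x) & -(align)))

/* Smallest value >= x that is congruent to phase modulo align. */
#define P2PHASEUP(x, align, phase)  ((phase) - (((phase) - (x)) & -(align)))

/* True if v is aligned to a. */
#define IS_P2ALIGNED(v, a)          ((((uintptr_t)(v)) & ((uintptr_t)(a) - 1)) == 0)

With those semantics, P2PHASEUP(ecb->dte_size, sizeof (uint64_t), sizeof (dtrace_aggid_t)) returns the first offset at or past dte_size that sits 4 bytes into an 8-byte-aligned block, which is exactly what the tuple-alignment comment in the hunk calls for.
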
@@ -10147,7 +10174,7 @@ dtrace_ecb_action_add(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
                        break;
 
                case DTRACEACT_SPECULATE:
-                       if (ecb->dte_size > sizeof (dtrace_epid_t))
+                       if (ecb->dte_size > sizeof (dtrace_rechdr_t))
                                return (EINVAL);
 
                        if (dp == NULL)
@@ -10260,7 +10287,7 @@ dtrace_ecb_action_remove(dtrace_ecb_t *ecb)
 
        ecb->dte_action = NULL;
        ecb->dte_action_last = NULL;
-       ecb->dte_size = sizeof (dtrace_epid_t);
+       ecb->dte_size = 0;
 }
 
 static void
@@ -10534,11 +10561,13 @@ dtrace_buffer_switch(dtrace_buffer_t *buf)
        caddr_t tomax = buf->dtb_tomax;
        caddr_t xamot = buf->dtb_xamot;
        dtrace_icookie_t cookie;
+       hrtime_t now;
 
        ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
        ASSERT(!(buf->dtb_flags & DTRACEBUF_RING));
 
        cookie = dtrace_interrupt_disable();
+       now = dtrace_gethrtime();
        buf->dtb_tomax = xamot;
        buf->dtb_xamot = tomax;
        buf->dtb_xamot_drops = buf->dtb_drops;
@@ -10549,6 +10578,8 @@ dtrace_buffer_switch(dtrace_buffer_t *buf)
        buf->dtb_drops = 0;
        buf->dtb_errors = 0;
        buf->dtb_flags &= ~(DTRACEBUF_ERROR | DTRACEBUF_DROPPED);
+       buf->dtb_interval = now - buf->dtb_switched;
+       buf->dtb_switched = now;
        dtrace_interrupt_enable(cookie);
 }
 
@@ -16617,6 +16648,7 @@ dtrace_ioctl(dev_t dev, u_long cmd, user_addr_t arg, int md, cred_t *cr, int *rv
                        desc.dtbd_drops = buf->dtb_drops;
                        desc.dtbd_errors = buf->dtb_errors;
                        desc.dtbd_oldest = buf->dtb_xamot_offset;
+                       desc.dtbd_timestamp = dtrace_gethrtime();
 
                        lck_mtx_unlock(&dtrace_lock);
 
@@ -16669,6 +16701,7 @@ dtrace_ioctl(dev_t dev, u_long cmd, user_addr_t arg, int md, cred_t *cr, int *rv
                desc.dtbd_drops = buf->dtb_xamot_drops;
                desc.dtbd_errors = buf->dtb_xamot_errors;
                desc.dtbd_oldest = 0;
+               desc.dtbd_timestamp = buf->dtb_switched;
 
                lck_mtx_unlock(&dtrace_lock);
 
diff --git a/bsd/hfs/hfs.h b/bsd/hfs/hfs.h
index 0f3771a2204fa3a687b0410eb309bc5eda6acab3..e3898a3a1a2ea824b63dfd9b2e72a60b3cfa5d32 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -106,6 +106,9 @@ extern struct timezone gTimeZone;
 /* How many free extents to cache per volume */
 #define kMaxFreeExtents                10
 
+/* The maximum time hfs locks can be held while performing hfs statistics gathering */
+#define HFS_FSINFO_MAX_LOCKHELD_TIME   20 * 1000000ULL /* at most 20 milliseconds. */
+
 /*
  * HFS_MINFREE gives the minimum acceptable percentage
  * of file system blocks which may be free (but this
@@ -715,20 +718,6 @@ extern int hfs_prepare_release_storage (struct hfsmount *hfsmp, struct vnode *vp
 
 extern int hfs_bmap(struct vnode *, daddr_t, struct vnode **, daddr64_t *, unsigned int *);
 
-extern int hfs_fsync(struct vnode *, int, int, struct proc *);
-
-extern int hfs_access(struct vnode *, mode_t, kauth_cred_t, struct proc *);
-
-extern int hfs_removeallattr(struct hfsmount *hfsmp, u_int32_t fileid);
-
-extern int hfs_set_volxattr(struct hfsmount *hfsmp, unsigned int xattrtype, int state);
-
-extern int hfs_isallocated(struct hfsmount *hfsmp, u_int32_t startingBlock, u_int32_t numBlocks);
-
-extern int hfs_count_allocated(struct hfsmount *hfsmp, u_int32_t startBlock, 
-               u_int32_t numBlocks, u_int32_t *alloc_count);
-
-extern int hfs_isrbtree_active (struct hfsmount *hfsmp);
 extern errno_t hfs_ubc_setsize(vnode_t vp, off_t len, bool have_cnode_lock);
 
 
@@ -904,6 +893,7 @@ extern int hfs_vgetrsrc(struct hfsmount *hfsmp, struct vnode *vp,
 
 extern int hfs_update(struct vnode *, int);
 
+extern int hfs_fsync(struct vnode *, int, int, struct proc *);
 
 /*****************************************************************************
        Functions from hfs_xattr.c
@@ -929,6 +919,9 @@ int hfs_getxattr_internal(cnode_t *, struct vnop_getxattr_args *,
 int hfs_xattr_write(vnode_t vp, const char *name, const void *data, size_t size);
 int hfs_setxattr_internal(struct cnode *, const void *, size_t, 
                           struct vnop_setxattr_args *, struct hfsmount *, u_int32_t);
+extern int hfs_removeallattr(struct hfsmount *hfsmp, u_int32_t fileid);
+extern int hfs_set_volxattr(struct hfsmount *hfsmp, unsigned int xattrtype, int state);
+
 
 
 /*****************************************************************************
@@ -951,6 +944,23 @@ extern cnid_t  hfs_currentparent(cnode_t *cp);
 extern cnid_t  hfs_currentcnid(cnode_t *cp);
 
 
+/*****************************************************************************
+       Functions from VolumeAllocation.c
+ ******************************************************************************/
+extern int hfs_isallocated(struct hfsmount *hfsmp, u_int32_t startingBlock,
+                                                  u_int32_t numBlocks);
+
+extern int hfs_count_allocated(struct hfsmount *hfsmp, u_int32_t startBlock,
+                                                          u_int32_t numBlocks, u_int32_t *alloc_count);
+
+extern int hfs_isrbtree_active (struct hfsmount *hfsmp);
+
+/*****************************************************************************
+       Functions from hfs_fsinfo.c
+ ******************************************************************************/
+extern errno_t hfs_get_fsinfo(struct hfsmount *hfsmp, void *a_data);
+extern void hfs_fsinfo_data_add(struct hfs_fsinfo_data *fsinfo, uint64_t entry);
+
 #endif /* __APPLE_API_PRIVATE */
 #endif /* KERNEL */
 #endif /* __HFS__ */
diff --git a/bsd/hfs/hfs_cnode.c b/bsd/hfs/hfs_cnode.c
index c2f92f02a4bfe97399ee9b43b196ea90195ceba5..89589de2855cb5e5105e57a2068131ba7f9f4c2f 100644 (file)
@@ -2448,18 +2448,20 @@ hfs_unlock_truncate(struct cnode *cp, enum hfs_lockflags flags)
                vnode_t vp = NULL, rvp = NULL;
 
                /*
-                * Deal with any pending set sizes.  We need to call
-                * ubc_setsize before we drop the exclusive lock.  Ideally,
-                * hfs_unlock should be called before hfs_unlock_truncate but
-                * that's a lot to ask people to remember :-)
+                * If there are pending set sizes, the cnode lock should be dropped
+                * first.
                 */
+#if DEBUG
+               assert(!(cp->c_lockowner == thread
+                                && ISSET(cp->c_flag, C_NEED_DATA_SETSIZE | C_NEED_RSRC_SETSIZE)));
+#elif DEVELOPMENT
                if (cp->c_lockowner == thread
                        && ISSET(cp->c_flag, C_NEED_DATA_SETSIZE | C_NEED_RSRC_SETSIZE)) {
-                       // hfs_unlock will do the setsize calls for us
-                       hfs_unlock(cp);
-                       hfs_lock_always(cp, HFS_EXCLUSIVE_LOCK);
+                       printf("hfs: hfs_unlock_truncate called with C_NEED_DATA/RSRC_SETSIZE set (caller: 0x%llx)\n",
+                                  (uint64_t)VM_KERNEL_UNSLIDE(__builtin_return_address(0)));
                }
+#endif
+
                if (cp->c_need_dvnode_put_after_truncate_unlock) {
                        vp = cp->c_vp;
                        cp->c_need_dvnode_put_after_truncate_unlock = false;
diff --git a/bsd/hfs/hfs_fsctl.h b/bsd/hfs/hfs_fsctl.h
index f7f3c26b19d3e50bbef7cf4d840e31d4bc799c96..b90b722b5ca74340fdc680b1ca1e07bde44b5e1f 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2014 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2004-2015 Apple Computer, Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -54,6 +54,7 @@ struct hfs_journal_info {
 };
 
 
+// Will be deprecated and replaced by hfs_fsinfo
 struct hfsinfo_metadata {
        uint32_t total;
        uint32_t extents;
@@ -64,6 +65,189 @@ struct hfsinfo_metadata {
        uint32_t reserved[4];
 };
 
+/*
+ * Flags for hfs_fsinfo_data structure
+ */
+#define HFS_FSINFO_CLASS_A      0x0001 /* Information for class A files requested */
+#define HFS_FSINFO_CLASS_B      0x0002 /* Information for class B files requested */
+#define HFS_FSINFO_CLASS_C      0x0004 /* Information for class C files requested */
+#define HFS_FSINFO_CLASS_D      0x0008 /* Information for class D files requested */
+
+/*
+ * Maximum number of buckets to represent range from 0 to 1TB (2^40) in
+ * increments of power of 2, and one catch-all bucket for anything that
+ * is greater than 1TB
+ */
+#define HFS_FSINFO_DATA_MAX_BUCKETS     42
+
+/*
+ * Maximum number of buckets to represent the percentage range from 0 to 100
+ * in increments of 10.
+ */
+#define HFS_FSINFO_PERCENT_MAX_BUCKETS  10
+
+/*
+ * Maximum number of buckets to represent number of file/directory name characters
+ * (range 1 to 255) in increments of 5.
+ */
+#define HFS_FSINFO_NAME_MAX_BUCKETS     51
+
+/*
+ * Version number to ensure that the caller and the kernel have same understanding
+ * of the hfs_fsinfo_data structure.  This version needs to be bumped whenever the
+ * number of buckets is changed.
+ */
+#define HFS_FSINFO_VERSION              1
+
+/*
+ * hfs_fsinfo_data is generic data structure to aggregate information like sizes
+ * or counts in buckets of power of 2.  Each bucket represents a range of values
+ * that is determined based on its index in the array.  Specifically, buckets[i]
+ * represents values that are greater than or equal to 2^(i-1) and less than 2^i,
+ * except the last bucket which represents range greater than or equal to 2^(i-1)
+ *
+ * The current maximum number of buckets is 42 (indices 0 to 41), so we can represent range from
+ * 0 up to 1TB in increments of power of 2, and then a catch-all bucket of
+ * anything that is greater than or equal to 1TB.
+ *
+ * For example,
+ * bucket[0]  -> greater than or equal to 0 and less than 1
+ * bucket[1]  -> greater than or equal to 1 and less than 2
+ * bucket[10] -> greater than or equal to 2^(10-1) = 512 and less than 2^10 = 1024
+ * bucket[20] -> greater than or equal to 2^(20-1) = 512KB and less than 2^20 = 1MB
+ * bucket[41] -> greater than or equal to 2^(41-1) = 1TB
+ *
+ * Note that fsctls that populate this data structure can take long time to
+ * execute as this operation can be I/O intensive (traversing btrees) and compute
+ * intensive.
+ *
+ * WARNING: Any changes to this structure should also update version number to
+ * ensure that the clients and kernel are reading/writing correctly.
+ */
+
+/* 
+ * The header includes the user input fields.
+ */
+typedef struct hfs_fsinfo_header {
+       uint32_t request_type;
+       uint16_t version;
+       uint16_t flags;
+} hfs_fsinfo_header_t;
+
+struct hfs_fsinfo_data {
+       hfs_fsinfo_header_t header;
+       uint32_t                        bucket[HFS_FSINFO_DATA_MAX_BUCKETS];
+};
+
+/*
+ * Structure to represent information about metadata files
+ *
+ * WARNING: Any changes to this structure should also update version number to
+ * ensure that the clients and kernel are reading/writing correctly.
+ */
+struct hfs_fsinfo_metadata {
+       hfs_fsinfo_header_t header;
+       uint32_t                        extents;
+       uint32_t                        catalog;
+       uint32_t                        allocation;
+       uint32_t                        attribute;
+       uint32_t                        journal;
+};
+
+/*
+ * Structure to represent distribution of number of file name characters
+ * in increments of 5s.  Each bucket represents a range of values that is
+ * determined based on its index in the array.  So bucket[i] represents values
+ * that are greater than or equal to (i*5) and less than ((i+1)*5).
+ *
+ * Since this structure represents range of file name characters and the
+ * maximum number of unicode characters in HFS+ is 255, the maximum number
+ * of buckets will be 52 [0..51].
+ *
+ * For example,
+ * bucket[4] -> greater than or equal to 20 and less than 25 characters
+ * bucket[51] -> equal to 255 characters
+ *
+ * WARNING: Any changes to this structure should also update version number to
+ * ensure that the clients and kernel are reading/writing correctly.
+ */
+struct hfs_fsinfo_name {
+       hfs_fsinfo_header_t     header;
+       uint32_t                        bucket[HFS_FSINFO_NAME_MAX_BUCKETS];
+};
+
+/*
+ * Structure to represent information about content protection classes
+ *
+ * WARNING: Any changes to this structure should also update version number to
+ * ensure that the clients and kernel are reading/writing correctly.
+ */
+struct hfs_fsinfo_cprotect {
+       hfs_fsinfo_header_t     header;
+       uint32_t class_A;
+       uint32_t class_B;
+       uint32_t class_C;
+       uint32_t class_D;
+       uint32_t class_E;
+       uint32_t class_F;
+};
+
+/*
+ * Union of all the different values returned by HFSIOC_FSINFO fsctl
+ */
+union hfs_fsinfo {
+       hfs_fsinfo_header_t                     header;
+       struct hfs_fsinfo_data          data;
+       struct hfs_fsinfo_metadata      metadata;
+       struct hfs_fsinfo_name          name;
+       struct hfs_fsinfo_cprotect cprotect;
+};
+typedef union hfs_fsinfo hfs_fsinfo;
+
+/*
+ * Type of FSINFO requested, specified by the caller in request_type field
+ */
+enum {
+       /* Information about number of allocation blocks for each metadata file, returns struct hfs_fsinfo_metadata */
+       HFS_FSINFO_METADATA_BLOCKS_INFO = 1,
+       
+       /* Information about number of extents for each metadata file, returns struct hfs_fsinfo_metadata */
+       HFS_FSINFO_METADATA_EXTENTS             = 2,
+       
+       /* Information about percentage of free nodes vs used nodes in metadata btrees, returns struct hfs_fsinfo_metadata */
+       HFS_FSINFO_METADATA_PERCENTFREE = 3,
+       
+       /* Distribution of number of extents for data files (data fork, no rsrc fork, no xattr), returns struct hfs_fsinfo_data */
+       HFS_FSINFO_FILE_EXTENT_COUNT    = 4,
+       
+       /* Distribution of extent sizes for data files (data fork, no rsrc fork, no xattr), returns struct hfs_fsinfo_data */
+       HFS_FSINFO_FILE_EXTENT_SIZE             = 5,
+       
+       /* Distribution of file sizes for data files (data fork, no rsrc fork, no xattr), returns struct hfs_fsinfo_data */
+       HFS_FSINFO_FILE_SIZE                    = 6,
+
+       /* Distribution of valence for all directories, returns struct hfs_fsinfo_data */
+       HFS_FSINFO_DIR_VALENCE                  = 7,
+       
+       /* Distribution of file/directory name size in unicode characters, returns struct hfs_fsinfo_name */
+       HFS_FSINFO_NAME_SIZE                    = 8,
+       
+       /* Distribution of extended attribute sizes, returns hfs_fsinfo_data */
+       HFS_FSINFO_XATTR_SIZE                   = 9,
+       
+       /* Distribution of free space for the entire file system, returns struct hfs_fsinfo_data */
+       HFS_FSINFO_FREE_EXTENTS                 = 10,
+
+       /* Information about number of files belonging to each class, returns hfs_fsinfo_cprotect */
+       HFS_FSINFO_FILE_CPROTECT_COUNT  = 11,
+
+       /*
+        * Distribution of symbolic link sizes for data files (data fork, no rsrc fork, no xattr),
+        * returns struct hfs_fsinfo_data
+        */
+       HFS_FSINFO_SYMLINK_SIZE                 = 12,
+};
+
 
 /* HFS FS CONTROL COMMANDS */
 
@@ -166,6 +350,8 @@ struct hfsinfo_metadata {
 
 
 /* 
+ * XXX: Will be deprecated and replaced by HFSIOC_GET_FSINFO
+ *
  * Get information about number of file system allocation blocks used by metadata 
  * files on the volume, including individual btrees and journal file.  The caller 
  * can determine the size of file system allocation block using value returned as 
@@ -178,6 +364,10 @@ struct hfsinfo_metadata {
 #define HFSIOC_CS_FREESPACE_TRIM _IOWR('h', 39, u_int32_t)
 #define HFS_CS_FREESPACE_TRIM    IOCBASECMD(HFSIOC_CS_FREESPACE_TRIM)
 
+/* Get file system information for the given volume */
+#define HFSIOC_GET_FSINFO        _IOWR('h', 45, hfs_fsinfo)
+#define HFS_GET_FSINFO           IOCBASECMD(HFSIOC_GET_FSINFO)
+
 #endif /* __APPLE_API_UNSTABLE */
 
 #endif /* ! _HFS_FSCTL_H_ */
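
For context, a userspace caller reaches the new request types through fsctl(2). The sketch below is not part of this commit and makes two assumptions: that hfs_fsctl.h is installed as <hfs/hfs_fsctl.h>, and that fsctl() is passed the full _IOWR-encoded HFSIOC_GET_FSINFO (the IOCBASECMD form, HFS_GET_FSINFO, being what the in-kernel ioctl handler switches on).

/* Hypothetical userspace sketch: ask for the file-size distribution on "/". */
#include <sys/fsctl.h>          /* fsctl(2) */
#include <hfs/hfs_fsctl.h>      /* hfs_fsinfo, HFSIOC_GET_FSINFO -- path assumed */
#include <stdio.h>
#include <string.h>

int
main(void)
{
        hfs_fsinfo fsinfo;

        memset(&fsinfo, 0, sizeof(fsinfo));
        fsinfo.header.version = HFS_FSINFO_VERSION;
        fsinfo.header.request_type = HFS_FSINFO_FILE_SIZE;

        if (fsctl("/", HFSIOC_GET_FSINFO, &fsinfo, 0) != 0) {
                perror("fsctl(HFSIOC_GET_FSINFO)");
                return 1;
        }

        /* bucket[i] counts files whose size falls in [2^(i-1), 2^i); bucket[0] is zero-length files. */
        for (int i = 0; i < HFS_FSINFO_DATA_MAX_BUCKETS; i++)
                printf("bucket[%2d]: %u\n", i, fsinfo.data.bucket[i]);

        return 0;
}

Only the header fields (version, request_type and, for the class requests, flags) need to be filled in on input; as the new hfs_fsinfo.c below shows, hfs_get_fsinfo() zeroes everything past the header before populating the reply.
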
diff --git a/bsd/hfs/hfs_fsinfo.c b/bsd/hfs/hfs_fsinfo.c
new file mode 100644 (file)
index 0000000..d307108
--- /dev/null
@@ -0,0 +1,891 @@
+/*
+ * Copyright (c) 2014-2015 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ * 
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ * 
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ * 
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ * 
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include <sys/cprotect.h>
+#include <sys/xattr.h>
+#include <sys/utfconv.h>
+#include <libkern/OSByteOrder.h>
+#include <kern/kalloc.h>
+#include <sys/stat.h>
+
+#include "hfs.h"
+#include "hfs_fsctl.h"
+#include "hfs_endian.h"
+#include "hfscommon/headers/BTreesInternal.h"
+#include "hfscommon/headers/BTreesPrivate.h"
+#include "hfscommon/headers/FileMgrInternal.h"
+
+#if CONFIG_PROTECT
+#include <hfs/hfs_cprotect.h>
+#endif
+
+
+union HFSPlusRecord {
+       HFSPlusCatalogFolder folder_record;
+       HFSPlusCatalogFile file_record;
+       HFSPlusCatalogThread thread_record;
+       HFSPlusExtentRecord extent_record;
+       HFSPlusAttrRecord attr_record;
+}; 
+typedef union HFSPlusRecord HFSPlusRecord;
+
+union HFSPlusKey {
+       HFSPlusExtentKey extent_key;
+       HFSPlusAttrKey attr_key;
+};
+typedef union HFSPlusKey HFSPlusKey;
+
+typedef enum traverse_btree_flag {
+       
+       //If set, extents btree will also be traversed along with catalog btree, so grab correct locks upfront
+       // If set, the extents btree will also be traversed along with the catalog btree, so grab correct locks upfront
+
+       // Getting content-protection attributes, allocate enough space to accommodate the records.
+       TRAVERSE_BTREE_XATTR_CPROTECT = 2,
+       
+} traverse_btree_flag_t;
+
+
+
+static errno_t hfs_fsinfo_metadata_blocks(struct hfsmount *hfsmp, struct hfs_fsinfo_metadata *fsinfo);
+static errno_t hfs_fsinfo_metadata_extents(struct hfsmount *hfsmp, struct hfs_fsinfo_metadata *fsinfo);
+static errno_t hfs_fsinfo_metadata_percentfree(struct hfsmount *hfsmp, struct hfs_fsinfo_metadata *fsinfo);
+static errno_t fsinfo_file_extent_count_callback(struct hfsmount *hfsmp, HFSPlusKey *key, HFSPlusRecord *record, void *data);
+static errno_t fsinfo_file_extent_size_catalog_callback(struct hfsmount *hfsmp, HFSPlusKey *key, HFSPlusRecord *record, void *data);
+static errno_t fsinfo_file_extent_size_overflow_callback(struct hfsmount *hfsmp, HFSPlusKey *key, HFSPlusRecord *record, void *data);
+static errno_t fsinfo_file_size_callback(struct hfsmount *hfsmp, HFSPlusKey *key, HFSPlusRecord *record, void *data);
+static errno_t fsinfo_dir_valence_callback(struct hfsmount *hfsmp, HFSPlusKey *key, HFSPlusRecord *record, void *data);
+static errno_t fsinfo_name_size_callback(struct hfsmount *hfsmp, HFSPlusKey *key, HFSPlusRecord *record, void *data);
+static errno_t fsinfo_xattr_size_callback(struct hfsmount *hfsmp, HFSPlusKey *key, HFSPlusRecord *record, void *data);
+static errno_t traverse_btree(struct hfsmount *hfsmp, uint32_t btree_fileID, traverse_btree_flag_t flags, void *fsinfo,
+               int (*callback)(struct hfsmount *, HFSPlusKey *, HFSPlusRecord *, void *));
+static errno_t hfs_fsinfo_free_extents(struct hfsmount *hfsmp, struct hfs_fsinfo_data *fsinfo);
+static void fsinfo_free_extents_callback(void *data, off_t free_extent_size);
+#if CONFIG_PROTECT
+static errno_t fsinfo_cprotect_count_callback(struct hfsmount *hfsmp, HFSPlusKey *key, HFSPlusRecord *record, void *data);
+#endif
+static errno_t fsinfo_symlink_size_callback(struct hfsmount *hfsmp, HFSPlusKey *key, HFSPlusRecord *record, void *data);
+
+/* 
+ * Entry function for all the fsinfo requests from hfs_vnop_ioctl() 
+ * Depending on the type of request, this function will call the 
+ * appropriate sub-function and return success or failure back to 
+ * the caller.
+ */
+__private_extern__
+errno_t hfs_get_fsinfo(struct hfsmount *hfsmp, void *a_data)
+{
+       int error = 0;
+       hfs_fsinfo *fsinfo_union;
+       uint32_t request_type;
+       uint32_t header_len = sizeof(hfs_fsinfo_header_t);
+
+       fsinfo_union = (hfs_fsinfo *)a_data;
+       request_type = fsinfo_union->header.request_type;
+
+       // Zero out output fields to fsinfo_union, keep the user input fields intact.
+       bzero((char *)fsinfo_union + header_len, sizeof(hfs_fsinfo) - header_len);
+
+       switch (request_type) {
+               case HFS_FSINFO_METADATA_BLOCKS_INFO:
+                       error = hfs_fsinfo_metadata_blocks(hfsmp, &(fsinfo_union->metadata));
+                       break;
+
+               case HFS_FSINFO_METADATA_EXTENTS:
+                       error = hfs_fsinfo_metadata_extents(hfsmp, &(fsinfo_union->metadata));
+                       break;
+
+               case HFS_FSINFO_METADATA_PERCENTFREE:
+                       error = hfs_fsinfo_metadata_percentfree(hfsmp, &(fsinfo_union->metadata));
+                       break;
+
+               case HFS_FSINFO_FILE_EXTENT_COUNT:
+                       /* Traverse catalog btree and invoke callback for all records */
+                       error = traverse_btree(hfsmp, kHFSCatalogFileID, TRAVERSE_BTREE_EXTENTS, &(fsinfo_union->data), fsinfo_file_extent_count_callback);
+                       break;
+
+               case HFS_FSINFO_FILE_EXTENT_SIZE:
+                       /* Traverse the catalog btree first */
+                       error = traverse_btree(hfsmp, kHFSCatalogFileID, 0, &(fsinfo_union->data), &fsinfo_file_extent_size_catalog_callback);
+                       if (error) {
+                               break;
+                       }
+                       /* Traverse the overflow extents btree now */
+                       error = traverse_btree(hfsmp, kHFSExtentsFileID, 0, &(fsinfo_union->data), &fsinfo_file_extent_size_overflow_callback);
+                       break;
+
+               case HFS_FSINFO_FILE_SIZE:
+                       /* Traverse catalog btree and invoke callback for all records */
+                       error = traverse_btree(hfsmp, kHFSCatalogFileID, 0, &(fsinfo_union->data), &fsinfo_file_size_callback);
+                       break;
+
+               case HFS_FSINFO_DIR_VALENCE:
+                       /* Traverse catalog btree and invoke callback for all records */
+                       error = traverse_btree(hfsmp, kHFSCatalogFileID, 0, &(fsinfo_union->data), &fsinfo_dir_valence_callback);
+                       break;
+
+               case HFS_FSINFO_NAME_SIZE:
+                       /* Traverse catalog btree and invoke callback for all records */
+                       error = traverse_btree(hfsmp, kHFSCatalogFileID, 0, &(fsinfo_union->name), &fsinfo_name_size_callback);
+                       break;
+
+               case HFS_FSINFO_XATTR_SIZE:
+                       /* Traverse attribute btree and invoke callback for all records */
+                       error = traverse_btree(hfsmp, kHFSAttributesFileID, 0, &(fsinfo_union->data), &fsinfo_xattr_size_callback);
+                       break;
+
+               case HFS_FSINFO_FREE_EXTENTS:
+                       error = hfs_fsinfo_free_extents(hfsmp, &(fsinfo_union->data));
+                       break;
+
+               case HFS_FSINFO_SYMLINK_SIZE:
+                       /* Traverse catalog btree and invoke callback for all records */
+                       error = traverse_btree(hfsmp, kHFSCatalogFileID, 0, &(fsinfo_union->data), &fsinfo_symlink_size_callback);
+                       break;
+
+#if CONFIG_PROTECT
+               case HFS_FSINFO_FILE_CPROTECT_COUNT:
+                       /* Traverse attribute btree and invoke callback for all records */
+                       error = traverse_btree(hfsmp, kHFSAttributesFileID, TRAVERSE_BTREE_XATTR_CPROTECT, &(fsinfo_union->cprotect), &fsinfo_cprotect_count_callback);
+                       break;
+#endif
+
+               default:
+                       return ENOTSUP;
+       };
+
+       return error;
+}
+
+/* 
+ * This function provides information about total number of allocation blocks 
+ * for each individual metadata file.
+ */
+static errno_t
+hfs_fsinfo_metadata_blocks(struct hfsmount *hfsmp, struct hfs_fsinfo_metadata *fsinfo)
+{
+       int lockflags = 0;
+       int ret_lockflags = 0;
+
+       /* 
+        * Getting number of allocation blocks for all metadata files 
+        * should be a relatively quick operation, so we grab locks for all
+        * the btrees at the same time
+        */
+       lockflags = SFL_CATALOG | SFL_EXTENTS | SFL_BITMAP | SFL_ATTRIBUTE;
+       ret_lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_SHARED_LOCK);
+
+       /* Get information about all the btrees */
+       fsinfo->extents    = hfsmp->hfs_extents_cp->c_datafork->ff_blocks;
+       fsinfo->catalog    = hfsmp->hfs_catalog_cp->c_datafork->ff_blocks;
+       fsinfo->allocation = hfsmp->hfs_allocation_cp->c_datafork->ff_blocks;
+       if (hfsmp->hfs_attribute_cp)
+               fsinfo->attribute  = hfsmp->hfs_attribute_cp->c_datafork->ff_blocks;
+       else
+               fsinfo->attribute = 0;
+
+       /* Done with btrees, give up the locks */
+       hfs_systemfile_unlock(hfsmp, ret_lockflags);
+
+       /* Get information about journal file */
+       fsinfo->journal = howmany(hfsmp->jnl_size, hfsmp->blockSize);
+
+       return 0;
+}
+
+/* 
+ * Helper function to count the number of valid extents in a file fork structure
+ */
+static uint32_t
+hfs_count_extents_fp(struct filefork *ff)
+{
+       int i;
+       uint32_t count = 0;
+       for (i = 0; i < kHFSPlusExtentDensity; i++) {
+               if (ff->ff_data.cf_extents[i].blockCount == 0) {
+                       break;
+               }
+               count++;
+       }
+       return count;
+}
+
+
+/* 
+ * This is a helper function that counts the total number of valid 
+ * extents in all the overflow extent records for given fileID 
+ * in overflow extents btree
+ */
+static errno_t
+hfs_count_overflow_extents(struct hfsmount *hfsmp, uint32_t fileID, uint32_t *num_extents)
+{
+       int error;
+       FCB *fcb;
+       struct BTreeIterator *iterator = NULL;
+       FSBufferDescriptor btdata;
+       HFSPlusExtentKey *extentKey;
+       HFSPlusExtentRecord extentData;
+       uint32_t extent_count = 0;
+       int i;
+
+       fcb = VTOF(hfsmp->hfs_extents_vp);
+       MALLOC(iterator, struct BTreeIterator *, sizeof(struct BTreeIterator), M_TEMP, M_WAITOK | M_ZERO);
+       
+       extentKey = (HFSPlusExtentKey *) &iterator->key;        
+       extentKey->keyLength = kHFSPlusExtentKeyMaximumLength;
+       extentKey->forkType = kHFSDataForkType;
+       extentKey->fileID = fileID;
+       extentKey->startBlock = 0;
+
+       btdata.bufferAddress = &extentData;
+       btdata.itemSize = sizeof(HFSPlusExtentRecord);
+       btdata.itemCount = 1;
+
+       /* Search for overflow extent record */
+       error = BTSearchRecord(fcb, iterator, &btdata, NULL, iterator);
+       
+       /*
+        * We used startBlock of zero, so we will not find any records and errors
+        * are expected.  It will also position the iterator just before the first 
+        * overflow extent record for given fileID (if any). 
+        */
+       if (error && error != fsBTRecordNotFoundErr && error != fsBTEndOfIterationErr)
+                       goto out;
+       error = 0;
+
+       for (;;) {
+               
+               if (msleep(NULL, NULL, PINOD | PCATCH,
+                                  "hfs_fsinfo", NULL) == EINTR) {
+                       error = EINTR;
+                       break;
+               }
+               
+               error = BTIterateRecord(fcb, kBTreeNextRecord, iterator, &btdata, NULL);
+               if (error != 0) {
+                       /* These are expected errors, so mask them */
+                       if (error == fsBTRecordNotFoundErr || error == fsBTEndOfIterationErr) {
+                               error = 0;
+                       }
+                       break;
+               }
+
+               /* If we encounter different fileID, stop the iteration */
+               if (extentKey->fileID != fileID) {
+                       break;
+               }
+               
+               if (extentKey->forkType != kHFSDataForkType)
+                       break;
+               
+               /* This is our record of interest; only count the datafork extents. */
+               for (i = 0; i < kHFSPlusExtentDensity; i++) {
+                       if (extentData[i].blockCount == 0) {
+                               break;
+                       }
+                       extent_count++;
+               }
+       }
+
+out:
+       FREE(iterator, M_TEMP);
+
+       if (error == 0) {
+               *num_extents = extent_count;
+       }
+       return MacToVFSError(error);
+}
+
+/*
+ * This function provides information about total number of extents (including 
+ * extents from overflow extents btree, if any) for each individual metadata 
+ * file.
+ */
+static errno_t
+hfs_fsinfo_metadata_extents(struct hfsmount *hfsmp, struct hfs_fsinfo_metadata *fsinfo)
+{
+       int error = 0;
+       int lockflags = 0;
+       int ret_lockflags = 0;
+       uint32_t overflow_count;
+
+       /*
+        * Counting the number of extents for all metadata files should
+        * be a relatively quick operation, so we grab locks for all the
+        * btrees at the same time
+        */
+       lockflags = SFL_CATALOG | SFL_EXTENTS | SFL_BITMAP | SFL_ATTRIBUTE;
+       ret_lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_SHARED_LOCK);
+
+       /* Get number of extents for extents overflow btree */
+       fsinfo->extents = hfs_count_extents_fp(hfsmp->hfs_extents_cp->c_datafork);
+
+       /* Get number of extents for catalog btree */
+       fsinfo->catalog = hfs_count_extents_fp(hfsmp->hfs_catalog_cp->c_datafork);
+       if (fsinfo->catalog >= kHFSPlusExtentDensity) {
+               error = hfs_count_overflow_extents(hfsmp, kHFSCatalogFileID, &overflow_count);
+               if (error) {
+                       goto out;
+               }
+               fsinfo->catalog += overflow_count;
+       }
+
+       /* Get number of extents for allocation file */
+       fsinfo->allocation = hfs_count_extents_fp(hfsmp->hfs_allocation_cp->c_datafork);
+       if (fsinfo->allocation >= kHFSPlusExtentDensity) {
+               error = hfs_count_overflow_extents(hfsmp, kHFSAllocationFileID, &overflow_count);
+               if (error) {
+                       goto out;
+               }
+               fsinfo->allocation += overflow_count;
+       }
+
+       /*
+        * Get number of extents for attribute btree.
+        *      hfs_attribute_cp might be NULL.
+        */
+       if (hfsmp->hfs_attribute_cp) {
+               fsinfo->attribute = hfs_count_extents_fp(hfsmp->hfs_attribute_cp->c_datafork);
+               if (fsinfo->attribute >= kHFSPlusExtentDensity) {
+                       error = hfs_count_overflow_extents(hfsmp, kHFSAttributesFileID, &overflow_count);
+                       if (error) {
+                               goto out;
+                       }
+                       fsinfo->attribute += overflow_count;
+               }
+       }
+       /* Journal always has one extent */
+       fsinfo->journal = 1;
+out:
+       hfs_systemfile_unlock(hfsmp, ret_lockflags);
+       return error;
+}
+
+/* 
+ * Helper function to calculate percentage i.e. X is what percent of Y?
+ */
+static inline uint32_t 
+hfs_percent(uint32_t X, uint32_t Y)
+{
+       return (X * 100ll) / Y;
+}
+
+/*
+ * This function provides percentage of free nodes vs total nodes for each 
+ * individual metadata btrees, i.e. for catalog, overflow extents and 
+ * attributes btree.  This information is not applicable for allocation 
+ * file and journal file.
+ */
+static errno_t
+hfs_fsinfo_metadata_percentfree(struct hfsmount *hfsmp, struct hfs_fsinfo_metadata *fsinfo)
+{
+       int lockflags = 0;
+       int ret_lockflags = 0;
+       BTreeControlBlockPtr btreePtr;
+       uint32_t free_nodes, total_nodes;
+
+       /*
+        * Getting total and used nodes for all metadata btrees should 
+        * be a relatively quick operation, so we grab locks for all the
+        * btrees at the same time
+        */
+       lockflags = SFL_CATALOG | SFL_EXTENTS | SFL_BITMAP | SFL_ATTRIBUTE;
+       ret_lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_SHARED_LOCK);
+       
+       /* Overflow extents btree */
+       btreePtr = VTOF(hfsmp->hfs_extents_vp)->fcbBTCBPtr;
+       total_nodes = btreePtr->totalNodes;
+       free_nodes = btreePtr->freeNodes;
+       fsinfo->extents = hfs_percent(free_nodes, total_nodes);
+
+       /* Catalog btree */
+       btreePtr = VTOF(hfsmp->hfs_catalog_vp)->fcbBTCBPtr;
+       total_nodes = btreePtr->totalNodes;
+       free_nodes = btreePtr->freeNodes;
+       fsinfo->catalog = hfs_percent(free_nodes, total_nodes);
+
+       /* Attributes btree */
+       if (hfsmp->hfs_attribute_vp) {
+               btreePtr = VTOF(hfsmp->hfs_attribute_vp)->fcbBTCBPtr;
+               total_nodes = btreePtr->totalNodes;
+               free_nodes = btreePtr->freeNodes;
+               fsinfo->attribute = hfs_percent(free_nodes, total_nodes);
+       }
+
+       hfs_systemfile_unlock(hfsmp, ret_lockflags);
+       return 0;
+}
+
+/* 
+ * Helper function to calculate log base 2 for given number 
+ */
+static inline int 
+hfs_log2(uint64_t entry) 
+{
+       return (63 - __builtin_clzll(entry|1));
+}
+
+/*
+ * Helper function to account for input entry into the data 
+ * array based on its log base 2 value
+ */
+__private_extern__
+void hfs_fsinfo_data_add(struct hfs_fsinfo_data *fsinfo, uint64_t entry)
+{
+       /* 
+        * From hfs_fsctl.h - 
+        *
+        * hfs_fsinfo_data is generic data structure to aggregate information like sizes 
+        * or counts in buckets of power of 2.  Each bucket represents a range of values 
+        * that is determined based on its index in the array.  Specifically, buckets[i] 
+        * represents values that are greater than or equal to 2^(i-1) and less than 2^i, 
+        * except the last bucket which represents range greater than or equal to 2^(i-1)
+        *
+        * The current maximum number of buckets is 42 (indices 0 to 41), so we can represent range from
+        * 0 up to 1TB in increments of power of 2, and then a catch-all bucket of 
+        * anything that is greater than or equal to 1TB.
+        *
+        * For example, 
+        * bucket[0]  -> greater than or equal to 0 and less than 1
+        * bucket[1]  -> greater than or equal to 1 and less than 2
+        * bucket[10] -> greater than or equal to 2^(10-1) = 512 and less than 2^10 = 1024
+        * bucket[20] -> greater than or equal to 2^(20-1) = 512KB and less than 2^20 = 1MB
+        * bucket[41] -> greater than or equal to 2^(41-1) = 1TB
+        */
+       uint32_t bucket;
+
+       if (entry) {
+               /* 
+                * Calculate log base 2 value for the entry.
+                * Account for this value in the appropriate bucket.
+                * The last bucket is a catch-all bucket of
+                * anything that is greater than or equal to 1TB
+                */
+               bucket = MIN(hfs_log2(entry) + 1, HFS_FSINFO_DATA_MAX_BUCKETS-1);
+               ++fsinfo->bucket[bucket];
+       } else {
+               /* Entry is zero, so account it in 0th offset */
+               fsinfo->bucket[0]++;
+       }
+}
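
A quick worked example of the bucketing above (an editorial illustration, not part of the file):

/*
 * hfs_fsinfo_data_add(fsinfo, 1000):
 *     hfs_log2(1000) = 9               (512 <= 1000 < 1024)
 *     bucket = MIN(9 + 1, 41) = 10     -> counted in bucket[10], i.e. the range [512, 1024)
 *
 * hfs_fsinfo_data_add(fsinfo, 2TB):
 *     hfs_log2(2^41) = 41
 *     bucket = MIN(41 + 1, 41) = 41    -> clamped into the catch-all ">= 1TB" bucket
 */
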
+
+/* 
+ * Function to traverse all the records of a btree and then call caller-provided 
+ * callback function for every record found.  The type of btree is chosen based 
+ * on the fileID provided by the caller.  This function grabs the correct locks 
+ * depending on the type of btree it will be traversing and flags provided 
+ * by the caller.
+ *
+ * Note: It might drop and reacquire the locks during execution.
+ */
+static errno_t
+traverse_btree(struct hfsmount *hfsmp, uint32_t btree_fileID, traverse_btree_flag_t flags,
+                          void *fsinfo, int (*callback)(struct hfsmount *, HFSPlusKey *, HFSPlusRecord *, void *))
+{
+       int error = 0;
+       int lockflags = 0;
+       int ret_lockflags = 0;
+       FCB *fcb;
+       struct BTreeIterator *iterator = NULL;
+       struct FSBufferDescriptor btdata;
+       int btree_operation;
+       HFSPlusRecord record;
+       HFSPlusKey *key;
+       uint64_t start, timeout_abs;
+
+       switch(btree_fileID) {
+               case kHFSExtentsFileID: 
+                       fcb = VTOF(hfsmp->hfs_extents_vp);
+                       lockflags = SFL_EXTENTS;
+                       break;
+               case kHFSCatalogFileID:
+                       fcb = VTOF(hfsmp->hfs_catalog_vp);
+                       lockflags = SFL_CATALOG;
+                       break;
+               case kHFSAttributesFileID:
+                       // If the attributes file doesn't exist, there are no records to iterate.
+                       if (hfsmp->hfs_attribute_vp == NULL)
+                               return error;
+                       fcb = VTOF(hfsmp->hfs_attribute_vp);
+                       lockflags = SFL_ATTRIBUTE;
+                       break;
+
+               default:
+                       return EINVAL;
+       }
+
+       MALLOC(iterator, struct BTreeIterator *, sizeof(struct BTreeIterator), M_TEMP, M_WAITOK | M_ZERO);
+
+       /* The key is initialized to zero because we are traversing entire btree */
+       key = (HFSPlusKey *)&iterator->key;
+
+       if (flags & TRAVERSE_BTREE_EXTENTS) {
+               lockflags |= SFL_EXTENTS;
+       }
+
+       btdata.bufferAddress = &record;
+       btdata.itemSize = sizeof(HFSPlusRecord);
+       btdata.itemCount = 1;
+
+       /* Lock btree for duration of traversal */
+       ret_lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_SHARED_LOCK);
+       btree_operation = kBTreeFirstRecord;
+
+       nanoseconds_to_absolutetime(HFS_FSINFO_MAX_LOCKHELD_TIME, &timeout_abs);
+       start = mach_absolute_time();
+
+       while (1) {
+
+               if (msleep(NULL, NULL, PINOD | PCATCH,
+                                  "hfs_fsinfo", NULL) == EINTR) {
+                       error = EINTR;
+                       break;
+               }
+
+               error = BTIterateRecord(fcb, btree_operation, iterator, &btdata, NULL);
+               if (error != 0) {
+                       if (error == fsBTRecordNotFoundErr || error == fsBTEndOfIterationErr) {
+                               error = 0;
+                       }
+                       break;
+               }
+               /* Lookup next btree record on next call to BTIterateRecord() */
+               btree_operation = kBTreeNextRecord;
+
+               /* Call our callback function and stop iteration if there are any errors */
+               error = callback(hfsmp, key, &record, fsinfo);
+               if (error) {
+                       break;
+               }
+
+               /* let someone else use the tree after we've processed over HFS_FSINFO_MAX_LOCKHELD_TIME */
+               if ((mach_absolute_time() - start) >= timeout_abs) {
+
+                       /* release b-tree locks and let someone else get the lock */
+                       hfs_systemfile_unlock (hfsmp, ret_lockflags);
+
+                       /* add tsleep here to force context switch and fairness */
+                       tsleep((caddr_t)hfsmp, PRIBIO, "hfs_fsinfo", 1);
+
+                       /*
+                        * re-acquire the locks in the same way that we wanted them originally.
+                        * note: it is subtle but worth pointing out that between the time we
+                        * released these locks and now, the b-trees may have shifted
+                        * slightly but significantly. For example, the catalog or other b-tree could have grown
+                        * past 8 extents and now requires the extents lock to be held in order to be safely
+                        * manipulated. We can't be sure of the state of the b-tree from where we last left off.
+                        */
+
+                       ret_lockflags = hfs_systemfile_lock (hfsmp, lockflags, HFS_SHARED_LOCK);
+
+                       /*
+                        * It's highly likely that the search key we stashed away before dropping the lock
+                        * no longer points to an existing item.  The iterator's IterateRecord is able to
+                        * re-position itself and process the next record correctly.  With the lock dropped,
+                        * some records may be missed for statistics gathering, which is ok. The
+                        * point is to get aggregate values.
+                        */
+
+                       start = mach_absolute_time();
+
+                       /* loop back around and get another record */
+               }
+       }
+
+       hfs_systemfile_unlock(hfsmp, ret_lockflags);
+       FREE (iterator, M_TEMP);
+       return MacToVFSError(error);
+}
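
traverse_btree() is a generic visitor: it picks the b-tree from the fileID, takes the matching shared lock (plus the extents lock when TRAVERSE_BTREE_EXTENTS is set), and hands every leaf record to the supplied callback, periodically dropping and re-taking the locks for fairness. A minimal sketch of a compatible callback and call site follows; count_file_records() and struct file_count are hypothetical names used only for illustration and are not part of this change.

    /* Hypothetical callback: counts kHFSPlusFileRecord entries seen during the walk. */
    struct file_count {
            uint64_t files;
    };

    static int
    count_file_records(__unused struct hfsmount *hfsmp, __unused HFSPlusKey *key,
                    HFSPlusRecord *record, void *data)
    {
            struct file_count *fc = (struct file_count *)data;

            if (record->file_record.recordType == kHFSPlusFileRecord)
                    fc->files++;
            return 0;               /* returning non-zero stops the traversal */
    }

    /* Call site sketch: walk the catalog b-tree while also holding the extents lock. */
    static errno_t
    count_files(struct hfsmount *hfsmp, uint64_t *out)
    {
            struct file_count fc = { 0 };
            errno_t error;

            error = traverse_btree(hfsmp, kHFSCatalogFileID, TRAVERSE_BTREE_EXTENTS,
                            &fc, count_file_records);
            if (error == 0)
                    *out = fc.files;
            return error;
    }
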
+
+/* 
+ * Callback function to get distribution of number of extents 
+ * for all user files in given file system.  Note that this only 
+ * accounts for data fork, no resource fork. 
+ */
+static errno_t
+fsinfo_file_extent_count_callback(struct hfsmount *hfsmp, 
+               __unused HFSPlusKey *key, HFSPlusRecord *record, void *data)
+{
+       int i;
+       int error = 0;
+       uint32_t num_extents = 0;
+       uint32_t num_overflow = 0;
+       uint32_t blockCount;
+
+       if (record->file_record.recordType == kHFSPlusFileRecord) {
+               /* Count total number of extents for this file */
+               for (i = 0; i < kHFSPlusExtentDensity; i++) {
+                       blockCount = record->file_record.dataFork.extents[i].blockCount;
+                       if (blockCount == 0) {
+                               break;
+                       }
+                       num_extents++;
+               }
+               /* This file has overflow extent records, so search overflow btree */
+               if (num_extents >= kHFSPlusExtentDensity) {
+                       /* The caller also holds the extents overflow btree lock */
+                       error = hfs_count_overflow_extents(hfsmp, record->file_record.fileID, &num_overflow);
+                       if (error) {
+                               goto out;
+                       }
+                       num_extents += num_overflow;
+               }
+               hfs_fsinfo_data_add(data, num_extents);
+       }
+out:
+       return error;
+}
+
+/* 
+ * Callback function to get distribution of individual extent sizes
+ * (in bytes) for all user files in given file system from catalog 
+ * btree only.  Note that this only accounts for data fork, no resource 
+ * fork. 
+ */
+static errno_t fsinfo_file_extent_size_catalog_callback(__unused struct hfsmount *hfsmp,
+               __unused HFSPlusKey *key, HFSPlusRecord *record, void *data)
+{
+       int i;
+       uint32_t blockCount;
+       uint64_t extent_size;
+
+       if (record->file_record.recordType == kHFSPlusFileRecord) {
+               /* Traverse through all valid extents */
+               for (i = 0; i < kHFSPlusExtentDensity; i++) {
+                       blockCount = record->file_record.dataFork.extents[i].blockCount;
+                       if (blockCount == 0) {
+                               break;
+                       }
+                       extent_size = hfs_blk_to_bytes(blockCount, hfsmp->blockSize);
+                       hfs_fsinfo_data_add(data, extent_size);
+               }
+       }
+       return 0;
+}
+
+/* 
+ * Callback function to get distribution of individual extent sizes
+ * (in bytes) for all user files in given file system from overflow 
+ * extents btree only.  Note that this only accounts for data fork, 
+ * no resource fork. 
+ */
+static errno_t fsinfo_file_extent_size_overflow_callback(__unused struct hfsmount *hfsmp,
+               HFSPlusKey *key, HFSPlusRecord *record, void *data)
+{
+       int i;
+       uint32_t blockCount;
+       uint64_t extent_size;
+
+       if (key->extent_key.fileID >= kHFSFirstUserCatalogNodeID) {
+               // Only count the data fork extents.
+               if (key->extent_key.forkType == kHFSDataForkType) {
+                       for (i = 0; i < kHFSPlusExtentDensity; i++) {
+                               blockCount = record->extent_record[i].blockCount;
+                               if (blockCount == 0) {
+                                       break;
+                               }
+                               extent_size = hfs_blk_to_bytes(blockCount, hfsmp->blockSize);
+                               hfs_fsinfo_data_add(data, extent_size);
+                       }
+               }
+       }
+       return 0;
+}
+
+/* 
+ * Callback function to get distribution of file sizes (in bytes) 
+ * for all user files in given file system.  Note that this only 
+ * accounts for data fork, no resource fork. 
+ */
+static errno_t fsinfo_file_size_callback(__unused struct hfsmount *hfsmp,
+               __unused HFSPlusKey *key, HFSPlusRecord *record, void *data)
+{
+       if (record->file_record.recordType == kHFSPlusFileRecord) {
+               /* Record of interest, account for the size in the bucket */
+               hfs_fsinfo_data_add(data, record->file_record.dataFork.logicalSize);
+       }
+       return 0;
+}
+
+/*
+ * Callback function to get distribution of directory valence 
+ * for all directories in the given file system.
+ */
+static errno_t fsinfo_dir_valence_callback(__unused struct hfsmount *hfsmp,
+               __unused HFSPlusKey *key, HFSPlusRecord *record, void *data)
+{
+       if (record->folder_record.recordType == kHFSPlusFolderRecord) {
+               hfs_fsinfo_data_add(data, record->folder_record.valence);
+       }
+       return 0;
+}
+
+/* 
+ * Callback function to get distribution of number of unicode 
+ * characters in name for all files and directories for a given 
+ * file system.
+ */
+static errno_t fsinfo_name_size_callback(__unused struct hfsmount *hfsmp,
+               __unused HFSPlusKey *key, HFSPlusRecord *record, void *data)
+{
+       struct hfs_fsinfo_name *fsinfo = (struct hfs_fsinfo_name *)data;
+       uint32_t length;
+
+       if ((record->folder_record.recordType == kHFSPlusFolderThreadRecord) ||
+           (record->folder_record.recordType == kHFSPlusFileThreadRecord)) {
+               length = record->thread_record.nodeName.length;
+               /* Make sure that the nodeName is bounded, otherwise return error */
+               if (length > kHFSPlusMaxFileNameChars) {
+                       return EIO;
+               }
+               
+               // sanity check for a name length of zero, which isn't valid on disk.
+               if (length == 0)
+                       return EIO;
+               
+               /* Convert the length to a bucket index; lengths are grouped five characters per bucket */
+               length = (length - 1)/ 5;
+               /* Account this value into our bucket */
+               fsinfo->bucket[length]++;
+       }
+       return 0;
+}
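
For reference, the bucket arithmetic above groups name lengths five characters per bucket: (length - 1) / 5 maps lengths 1 through 5 to bucket 0, 6 through 10 to bucket 1, and the maximum of kHFSPlusMaxFileNameChars (255) to bucket 50. A stand-alone sketch of the same mapping:

    /* Illustration only: same bucketing as fsinfo_name_size_callback(). */
    #include <stdio.h>

    int main(void)
    {
            unsigned lengths[] = { 1, 5, 6, 10, 11, 255 };

            for (unsigned i = 0; i < sizeof(lengths) / sizeof(lengths[0]); i++)
                    printf("name length %3u -> bucket %u\n",
                        lengths[i], (lengths[i] - 1) / 5);
            return 0;       /* prints buckets 0, 0, 1, 1, 2 and 50 */
    }
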
+
+/* 
+ * Callback function to get distribution of size of all extended 
+ * attributes for a given file system.
+ */
+static errno_t fsinfo_xattr_size_callback(__unused struct hfsmount *hfsmp,
+               __unused HFSPlusKey *key, HFSPlusRecord *record, void *data)
+{
+       if (record->attr_record.recordType == kHFSPlusAttrInlineData) {
+               /* Inline attribute */
+               hfs_fsinfo_data_add(data, record->attr_record.attrData.attrSize);
+       } else if (record->attr_record.recordType == kHFSPlusAttrForkData) {
+               /* Larger attributes with extents information */
+               hfs_fsinfo_data_add(data, record->attr_record.forkData.theFork.logicalSize);
+       }
+       return 0;
+}
+
+
+/*
+ * Callback function to get distribution of free space extents for a given file system.
+ */
+static void fsinfo_free_extents_callback(void *data, off_t free_extent_size)
+{
+       // Assume a minimum of 4 KB block size
+       hfs_fsinfo_data_add(data, free_extent_size / 4096);
+}
+
+/*
+ * Function to get distribution of free space extents for a given file system.
+ */
+static errno_t hfs_fsinfo_free_extents(struct hfsmount *hfsmp, struct hfs_fsinfo_data *fsinfo)
+{
+       return hfs_find_free_extents(hfsmp, &fsinfo_free_extents_callback, fsinfo);
+}
+
+/*
+ * Callback function to get distribution of symbolic link sizes (in bytes)
+ * for all user files in given file system.  Note that this only
+ * accounts for data fork, no resource fork.
+ */
+static errno_t fsinfo_symlink_size_callback(__unused struct hfsmount *hfsmp,
+                                                                        __unused HFSPlusKey *key, HFSPlusRecord *record, void *data)
+{
+       if (record->file_record.recordType == kHFSPlusFileRecord) {
+               /* Record of interest, account for the size in the bucket */
+               if (S_ISLNK(record->file_record.bsdInfo.fileMode))
+                       hfs_fsinfo_data_add((struct hfs_fsinfo_data *)data, record->file_record.dataFork.logicalSize);
+       }
+       return 0;
+}
+
+#if CONFIG_PROTECT
+/*
+ * Callback function to get total number of files/directories
+ * for each content protection class
+ */
+static int fsinfo_cprotect_count_callback(struct hfsmount *hfsmp, HFSPlusKey *key,
+                                                                                 HFSPlusRecord *record, void *data)
+{
+       struct hfs_fsinfo_cprotect *fsinfo = (struct hfs_fsinfo_cprotect *)data;
+       static const uint16_t cp_xattrname_utf16[] = CONTENT_PROTECTION_XATTR_NAME_CHARS;
+       static const size_t cp_xattrname_utf16_len = sizeof(cp_xattrname_utf16)/2;
+       struct cp_xattr_v5 *xattr;
+       size_t xattr_len = sizeof(struct cp_xattr_v5);
+       struct cprotect cp_entry;
+       struct cprotect *cp_entryp = &cp_entry;
+       int error = 0;
+
+       /* Content protect xattrs are inline attributes only, so skip all others */
+       if (record->attr_record.recordType != kHFSPlusAttrInlineData)
+               return 0;
+
+       /* We only look at content protection xattrs */
+       if ((key->attr_key.attrNameLen != cp_xattrname_utf16_len) ||
+               (bcmp(key->attr_key.attrName, cp_xattrname_utf16, cp_xattrname_utf16_len))) {
+               return 0;
+       }
+
+       xattr = (struct cp_xattr_v5 *)((void *)(record->attr_record.attrData.attrData));
+       error = cp_read_xattr_v5(hfsmp, xattr, xattr_len, (cprotect_t *)&cp_entryp,
+                                                        CP_GET_XATTR_BASIC_INFO);
+       if (error)
+               return 0;
+
+       /* No key present, skip this record */
+       if (!ISSET(cp_entry.cp_flags, CP_HAS_A_KEY))
+               return 0;
+
+       /* Now account for the persistent class */
+       switch (CP_CLASS(cp_entry.cp_pclass)) {
+               case PROTECTION_CLASS_A:
+                       fsinfo->class_A++;
+                       break;
+               case PROTECTION_CLASS_B:
+                       fsinfo->class_B++;
+                       break;
+               case PROTECTION_CLASS_C:
+                       fsinfo->class_C++;
+                       break;
+               case PROTECTION_CLASS_D:
+                       fsinfo->class_D++;
+                       break;
+               case PROTECTION_CLASS_E:
+                       fsinfo->class_E++;
+                       break;
+               case PROTECTION_CLASS_F:
+                       fsinfo->class_F++;
+                       break;
+       };
+
+       return 0;
+}
+#endif
index 96e8c20ed3fb2bb1835ebf0ab6542fe1a7f88b91..f09bdc7d2609867889becac6b21bbc0c6795b79c 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -1578,7 +1578,7 @@ do_bulk_access_check(struct hfsmount *hfsmp, struct vnode *vp,
 int
 hfs_vnop_ioctl( struct vnop_ioctl_args /* {
                vnode_t a_vp;
-               int  a_command;
+               long  a_command;
                caddr_t  a_data;
                int  a_fflag;
                vfs_context_t a_context;
@@ -2654,6 +2654,37 @@ fail_change_next_allocation:
                break;
        }
 
+       case HFS_GET_FSINFO: {
+               hfs_fsinfo *fsinfo = (hfs_fsinfo *)ap->a_data;
+
+               /* Only root is allowed to get fsinfo */
+               if (!kauth_cred_issuser(kauth_cred_get())) {
+                       return EACCES;
+               }
+
+               /*
+                * Make sure that the caller's version number matches with
+                * the kernel's version number.  This will make sure that
+                * if the structures being read/written into are changed
+                * by the kernel, the caller will not read incorrect data.
+                *
+                * The first three fields --- request_type, version and
+                * flags are the same for all the hfs_fsinfo structures, so
+                * we can access the version number by assuming any
+                * structure for now.
+                */
+               if (fsinfo->header.version != HFS_FSINFO_VERSION) {
+                       return ENOTSUP;
+               }
+
+               /* Make sure that the current file system is not marked inconsistent */
+               if (hfsmp->vcbAtrb & kHFSVolumeInconsistentMask) {
+                       return EIO;
+               }
+
+               return hfs_get_fsinfo(hfsmp, ap->a_data);
+       }
+
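
From userspace this path is reached through fsctl(2) on the mounted volume. A rough sketch is below; it assumes the hfs_fsinfo definitions from bsd/hfs/hfs_fsctl.h are visible to the caller, and the header.request_type field name and the exact request constant passed to fsctl() are taken on trust from that header rather than from this hunk. Per the checks above, the call fails with EACCES for non-root callers, ENOTSUP on a version mismatch, and EIO if the volume is marked inconsistent.

    /* Userspace sketch (run as root); HFS_FSINFO_VERSION must match the kernel. */
    #include <string.h>
    #include <sys/fsctl.h>
    #include "hfs_fsctl.h"          /* assumed to be available to the build */

    static int
    get_fsinfo(const char *mount_path, hfs_fsinfo *info, uint32_t request_type)
    {
            memset(info, 0, sizeof(*info));
            info->header.request_type = request_type;   /* one of the HFS_FSINFO_* requests */
            info->header.version = HFS_FSINFO_VERSION;

            /* request constant shown as the kernel switch uses it; the userspace
             * header may expose it under a slightly different name */
            return fsctl(mount_path, HFS_GET_FSINFO, info, 0);
    }
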
        case HFS_CS_FREESPACE_TRIM: {
                int error = 0;
                int lockflags = 0;
index 5bfc09c3e8f1678fd1367cf111c5f34251cdd952..0c327a7922f7f2e11e9695514b28590a47a43a46 100644 (file)
@@ -3655,8 +3655,8 @@ relock:
         * truncate lock)
         */
 rm_done:
-       hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
        hfs_unlockpair(dcp, cp);
+       hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
 
        if (recycle_rsrc) {
                /* inactive or reclaim on rvp will clean up the blocks from the rsrc fork */
@@ -5224,12 +5224,12 @@ out:
            wakeup((caddr_t)&tdcp->c_flag);
        }
 
+       hfs_unlockfour(fdcp, fcp, tdcp, tcp);
+
        if (took_trunc_lock) {
                hfs_unlock_truncate(VTOC(tvp), HFS_LOCK_DEFAULT);       
        }
 
-       hfs_unlockfour(fdcp, fcp, tdcp, tcp);
-       
        /* Now vnode_put the resource forks vnodes if necessary */
        if (tvp_rsrc) {
                vnode_put(tvp_rsrc);
index d53fd5fd50c59dff6656c55cea0ad4b7168944da..909ab5c1d2fa1fa061b4781110fffecf50b3a10b 100644 (file)
@@ -642,7 +642,7 @@ static OSErr  MoveExtents( ExtendedVCB *vcb, u_int32_t srcFileID, u_int32_t dest
                
                if ( i != kNumExtentsToCache )                  //      if the buffer is not full, we must be done
                {
-                       err = DeleteExtents( vcb, srcFileID, forkType, quitEarly, isHFSPlus );  //      Now delete all the extent entries with the sourceID
+                       err = DeleteExtents( vcb, srcFileID, quitEarly, forkType, isHFSPlus );  //      Now delete all the extent entries with the sourceID
                        if ( DEBUG_BUILD && err != noErr )
                                DebugStr("Error from DeleteExtents");
                        break;                                                                  //      we're done!
index b49cf439c6a0ab20b0551561b38b5e9084a1ff73..79547be7fbb7ff9837bbc9b32dde23560b9bb173 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -155,31 +155,36 @@ Optimization Routines
                                        
 */
 
-#include "../../hfs_macos_defs.h"
-
 #include <sys/types.h>
 #include <sys/buf.h>
+
+
+#if !HFS_ALLOC_TEST
+
+#include "../../hfs_macos_defs.h"
 #include <sys/systm.h>
-#include <sys/sysctl.h>
-#include <sys/disk.h>
 #include <sys/ubc.h>
-#include <sys/uio.h>
 #include <kern/kalloc.h>
-#include <sys/malloc.h>
 
 /* For VM Page size */
 #include <libkern/libkern.h>
-
 #include "../../hfs.h"
-#include "../../hfs_dbg.h"
-#include "../../hfs_format.h"
 #include "../../hfs_endian.h"
-#include "../../hfs_macos_defs.h"
 #include "../headers/FileMgrInternal.h"
+#include <vfs/vfs_journal.h>
+
+#endif // !HFS_ALLOC_TEST
+
+#include <sys/sysctl.h>
+#include <sys/disk.h>
+#include <sys/uio.h>
+#include <sys/malloc.h>
+
+#include "../../hfs_dbg.h"
+#include "../../hfs_format.h"
 #include "../../hfs_kdebug.h"
 
 /* Headers for unmap-on-mount support */
-#include <vfs/vfs_journal.h>
 #include <sys/disk.h>
 
 #ifndef CONFIG_HFS_TRIM
@@ -357,6 +362,30 @@ static void remove_free_extent_cache(struct hfsmount *hfsmp, u_int32_t startBloc
 static Boolean add_free_extent_cache(struct hfsmount *hfsmp, u_int32_t startBlock, u_int32_t blockCount);
 static void sanity_check_free_ext(struct hfsmount *hfsmp, int check_allocated);
 
+/* Functions for getting free extents */
+
+typedef struct bitmap_context {
+       void                    *bitmap;                                // current bitmap chunk
+       uint32_t                run_offset;                             // offset (in bits) from start of bitmap to start of current run
+       uint32_t                chunk_current;                  // next bit to scan in the chunk
+       uint32_t                chunk_end;                              // number of valid bits in this chunk
+       struct hfsmount *hfsmp;
+       struct buf              *bp;
+       uint32_t                last_free_summary_bit;  // last marked free summary bit
+       int                             lockflags;
+       uint64_t                lock_start;
+} bitmap_context_t;
+
+
+static errno_t get_more_bits(bitmap_context_t *bitmap_ctx);
+static int bit_count_set(void *bitmap, int start, int end);
+static int bit_count_clr(void *bitmap, int start, int end);
+static errno_t hfs_bit_count(bitmap_context_t *bitmap_ctx, int (*fn)(void *, int ,int), uint32_t *bit_count);
+static errno_t hfs_bit_count_set(bitmap_context_t *bitmap_ctx, uint32_t *count);
+static errno_t hfs_bit_count_clr(bitmap_context_t *bitmap_ctx, uint32_t *count);
+static errno_t update_summary_table(bitmap_context_t *bitmap_ctx, uint32_t start, uint32_t count, bool set);
+static int clzll(uint64_t x);
+
 #if ALLOC_DEBUG
 /*
  * Validation Routine to verify that the TRIM list maintained by the journal
@@ -5153,3 +5182,462 @@ static void sanity_check_free_ext(struct hfsmount *hfsmp, int check_allocated)
        lck_spin_unlock(&hfsmp->vcbFreeExtLock);
 }
 
+#define BIT_RIGHT_MASK(bit)    (0xffffffffffffffffull >> (bit))
+#define kHighBitInDoubleWordMask 0x8000000000000000ull
+
+static int clzll(uint64_t x)
+{
+       if (x == 0)
+               return 64;
+       else
+               return __builtin_clzll(x);
+}
+
+#if !HFS_ALLOC_TEST
+
+static errno_t get_more_bits(bitmap_context_t *bitmap_ctx)
+{
+       uint32_t        start_bit;
+       uint32_t        iosize = 0;
+       uint32_t        byte_offset;
+       uint32_t        last_bitmap_block;
+       int                     error;
+       struct hfsmount *hfsmp = bitmap_ctx->hfsmp;
+#if !HFS_ALLOC_TEST
+       uint64_t        lock_elapsed;
+#endif
+
+
+       if (bitmap_ctx->bp)
+               ReleaseScanBitmapRange(bitmap_ctx->bp);
+       
+       if (msleep(NULL, NULL, PINOD | PCATCH,
+                          "hfs_fsinfo", NULL) == EINTR) {
+               return EINTR;
+       }
+
+#if !HFS_ALLOC_TEST
+       /*
+        * Let someone else use the allocation map after we've processed over HFS_FSINFO_MAX_LOCKHELD_TIME.
+        * lock_start is initialized in hfs_find_free_extents().
+        */
+       absolutetime_to_nanoseconds(mach_absolute_time() - bitmap_ctx->lock_start, &lock_elapsed);
+
+       if (lock_elapsed >= HFS_FSINFO_MAX_LOCKHELD_TIME) {
+
+               hfs_systemfile_unlock(hfsmp, bitmap_ctx->lockflags);
+               
+               /* add tsleep here to force context switch and fairness */
+               tsleep((caddr_t)get_more_bits, PRIBIO, "hfs_fsinfo", 1);
+
+               hfs_journal_lock(hfsmp);
+
+               /* Flush the journal and wait for all I/Os to finish up */
+               error = hfs_journal_flush(hfsmp, TRUE);
+               if (error) {
+                       hfs_journal_unlock(hfsmp);
+                       return error;
+               }
+
+               /*
+                * Take bitmap lock to ensure it is not being modified while journal is still held.
+                * Since we are reading larger than normal blocks from the bitmap, which
+                * might confuse other parts of the bitmap code using normal blocks, we
+                * take exclusive lock here.
+                */
+               bitmap_ctx->lockflags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK);
+
+               bitmap_ctx->lock_start = mach_absolute_time();
+
+               /* Release the journal lock */
+               hfs_journal_unlock(hfsmp);
+
+               /*
+                * Bitmap is read in large block size (up to 1MB),
+                * unlike the runtime which reads the bitmap in the
+                * 4K block size.  If the bitmap is read both ways
+                * at the same time, it can result in multiple buf_t with
+                * different sizes and potentially cause data corruption.
+                * To avoid this, we invalidate all the existing buffers
+                * associated with the bitmap vnode.
+                */
+               error = buf_invalidateblks(hfsmp->hfs_allocation_vp, 0, 0, 0);
+               if (error) {
+                       /* hfs_systemfile_unlock will be called in the caller */
+                       return error;
+               }
+       }
+#endif
+
+       start_bit = bitmap_ctx->run_offset;
+
+       if (start_bit >= bitmap_ctx->hfsmp->totalBlocks) {
+               bitmap_ctx->chunk_end = 0;
+               bitmap_ctx->bp = NULL;
+               bitmap_ctx->bitmap = NULL;
+               return 0;
+       }
+
+       assert(start_bit % 8 == 0);
+
+       /*
+        * Compute how much I/O we should generate here.
+        * hfs_scan_range_size will validate that the start bit
+        * converted into a byte offset into the bitmap file,
+        * is aligned on a VBMIOSize boundary.
+        */
+       error = hfs_scan_range_size (bitmap_ctx->hfsmp, start_bit, &iosize);
+       if (error)
+               return error;
+
+       /* hfs_scan_range_size should have verified startbit.  Convert it to bytes */
+       byte_offset = start_bit / kBitsPerByte;
+
+       /*
+        * When the journal replays blocks, it does so by writing directly to the disk
+        * device (bypassing any filesystem vnodes and such).  When it finishes its I/Os
+        * it also immediately re-reads and invalidates the range covered by the bp so
+        * it does not leave anything lingering in the cache (for iosize reasons).
+        *
+        * As such, it is safe to do large I/Os here with ReadBitmapRange.
+        *
+        * NOTE: It is not recommended, but it is possible to call the function below
+        * on sections of the bitmap that may be in core already as long as the pages are not
+        * dirty.  In that case, we'd notice that something starting at that
+        * logical block of the bitmap exists in the metadata cache, and we'd check
+        * if the iosize requested is the same as what was already allocated for it.
+        * Odds are pretty good we're going to request something larger.  In that case,
+        * we just free the existing memory associated with the buf and reallocate a
+        * larger range. This function should immediately invalidate it as soon as we're
+        * done scanning, so this shouldn't cause any coherency issues.
+        */
+       error = ReadBitmapRange(bitmap_ctx->hfsmp, byte_offset, iosize, (uint32_t **)&bitmap_ctx->bitmap, &bitmap_ctx->bp);
+       if (error)
+               return error;
+
+       /*
+        * At this point, we have a giant wired buffer that represents some portion of
+        * the bitmap file that we want to analyze.   We may not have gotten all 'iosize'
+        * bytes though, so clip our ending bit to what we actually read in.
+        */
+       last_bitmap_block = start_bit + buf_count(bitmap_ctx->bp) * kBitsPerByte;
+
+       /* Cap the last block to the total number of blocks if required */
+       if (last_bitmap_block > bitmap_ctx->hfsmp->totalBlocks)
+               last_bitmap_block = bitmap_ctx->hfsmp->totalBlocks;
+
+       bitmap_ctx->chunk_current = 0;  // new chunk of bitmap
+       bitmap_ctx->chunk_end = last_bitmap_block - start_bit;
+
+       return 0;
+}
+
+#endif // !HFS_ALLOC_TEST
+
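
get_more_bits() above and the b-tree walk in hfs_fsinfo.c share the same fairness scheme: remember when the lock was taken, and once more than HFS_FSINFO_MAX_LOCKHELD_TIME of absolute time has elapsed, drop the lock, yield, and re-take it. A small userspace sketch of just the timing half of that pattern; the lock, yield and re-lock steps are left as placeholder comments, and the one-second budget is an assumed value, not the kernel's constant.

    #include <stdint.h>
    #include <mach/mach_time.h>

    #define MAX_LOCKHELD_TIME_NS    (1ULL * 1000 * 1000 * 1000)     /* assumed 1 s budget */

    /* Convert a nanosecond budget into mach_absolute_time() units. */
    static uint64_t
    ns_to_abs(uint64_t ns)
    {
            mach_timebase_info_data_t tb;

            mach_timebase_info(&tb);
            return ns * tb.denom / tb.numer;
    }

    void
    scan_with_fairness(int work_items)
    {
            uint64_t budget = ns_to_abs(MAX_LOCKHELD_TIME_NS);
            uint64_t start  = mach_absolute_time();

            /* take the lock here */
            for (int i = 0; i < work_items; i++) {
                    /* ... process one chunk while holding the lock ... */
                    if (mach_absolute_time() - start >= budget) {
                            /* drop the lock, yield to waiters, re-take the lock */
                            start = mach_absolute_time();
                    }
            }
            /* release the lock here */
    }
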
+// Returns number of contiguous bits set at start
+static int bit_count_set(void *bitmap, int start, int end)
+{
+       if (start == end)
+               return 0;
+
+       assert(end > start);
+
+       const int start_bit = start & 63;
+       const int end_bit   = end & 63;
+
+       uint64_t *p = (uint64_t *)bitmap + start / 64;
+       uint64_t x = ~OSSwapBigToHostInt64(*p);
+
+       if ((start & ~63) == (end & ~63)) {
+               // Start and end in same 64 bits
+               x = (x & BIT_RIGHT_MASK(start_bit)) | BIT_RIGHT_MASK(end_bit);
+               return clzll(x) - start_bit;
+       }
+
+       // Deal with initial unaligned bit
+       x &= BIT_RIGHT_MASK(start_bit);
+
+       if (x)
+               return clzll(x) - start_bit;
+
+       // Go fast
+       ++p;
+       int count = 64 - start_bit;
+       int nquads = (end - end_bit - start - 1) / 64;
+
+       while (nquads--) {
+               if (*p != 0xffffffffffffffffull) {
+                       x = ~OSSwapBigToHostInt64(*p);
+                       return count + clzll(x);
+               }
+               ++p;
+               count += 64;
+       }
+
+       if (end_bit) {
+               x = ~OSSwapBigToHostInt64(*p) | BIT_RIGHT_MASK(end_bit);
+               count += clzll(x);
+       }
+
+       return count;
+}
+
+/* Returns the length of a run of cleared bits:
+ *  bitmap is a single chunk of memory being examined
+ *  start: the start bit relative to the current buffer to be examined; start is inclusive.
+ *  end: the end bit relative to the current buffer to be examined; end is not inclusive.
+ */
+static int bit_count_clr(void *bitmap, int start, int end)
+{
+       if (start == end)
+               return 0;
+
+       assert(end > start);
+
+       const int start_bit = start & 63;
+       const int end_bit   = end & 63;
+
+       uint64_t *p = (uint64_t *)bitmap + start / 64;
+       uint64_t x = OSSwapBigToHostInt64(*p);
+
+       if ((start & ~63) == (end & ~63)) {
+               // Start and end in same 64 bits
+               x = (x & BIT_RIGHT_MASK(start_bit)) | BIT_RIGHT_MASK(end_bit);
+
+               return clzll(x) - start_bit;
+       }
+
+       // Deal with initial unaligned bit
+       x &= BIT_RIGHT_MASK(start_bit);
+
+       if (x)
+               return clzll(x) - start_bit;
+
+       // Go fast
+       ++p;
+       int count = 64 - start_bit;
+       int nquads = (end - end_bit - start - 1) / 64;
+
+       while (nquads--) {
+               if (*p) {
+                       x = OSSwapBigToHostInt64(*p);
+                       return count + clzll(x);
+               }
+               ++p;
+               count += 64;
+       }
+
+       if (end_bit) {
+               x = OSSwapBigToHostInt64(*p) | BIT_RIGHT_MASK(end_bit);
+
+               count += clzll(x);
+       }
+
+       return count;
+}
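
Both helpers view the bitmap as big-endian 64-bit words, with bit 0 being the most significant bit of byte 0 (the on-disk HFS+ allocation bitmap layout), and use BIT_RIGHT_MASK() plus clzll() to measure the run that starts at start and is clipped at end. A small check of the expected behaviour, assuming the file is built with HFS_ALLOC_TEST so the two static helpers can be exercised from test code in the same translation unit:

    #include <assert.h>
    #include <stdint.h>

    static void
    bit_run_example(void)
    {
            /* Big-endian bitmap, bit 0 = MSB of byte 0:
             * bits 0-9 set, 10-63 clear, 64-83 set, 84-127 clear. */
            static uint8_t bitmap[16] __attribute__((aligned(8))) = {
                    0xFF, 0xC0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                    0xFF, 0xFF, 0xF0, 0x00, 0x00, 0x00, 0x00, 0x00
            };

            assert(bit_count_set(bitmap,  0, 128) == 10);   /* set run at bit 0     */
            assert(bit_count_clr(bitmap, 10, 128) == 54);   /* clear run 10..63     */
            assert(bit_count_set(bitmap, 64, 128) == 20);   /* set run 64..83       */
            assert(bit_count_clr(bitmap, 84, 128) == 44);   /* clear run to bit 127 */
    }
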
+
+#if !HFS_ALLOC_TEST
+static errno_t update_summary_table(bitmap_context_t *bitmap_ctx, uint32_t start, uint32_t count, bool set)
+{
+       uint32_t        end, start_summary_bit, end_summary_bit;
+       errno_t         error = 0;
+
+       if (count == 0)
+               goto out;
+
+       if (!ISSET(bitmap_ctx->hfsmp->hfs_flags, HFS_SUMMARY_TABLE))
+               return 0;
+
+       if (hfs_get_summary_index (bitmap_ctx->hfsmp, start, &start_summary_bit)) {
+               error = EINVAL;
+               goto out;
+       }
+
+       end = start + count - 1;
+       if (hfs_get_summary_index (bitmap_ctx->hfsmp, end, &end_summary_bit)) {
+               error = EINVAL;
+               goto out;
+       }
+
+       // if summary table bit has been updated with free block previously, leave it.
+       if ((start_summary_bit == bitmap_ctx->last_free_summary_bit) && set)
+               start_summary_bit++;
+
+       for (uint32_t summary_bit = start_summary_bit; summary_bit <= end_summary_bit; summary_bit++)
+               hfs_set_summary (bitmap_ctx->hfsmp, summary_bit, set);
+
+       if (!set)
+               bitmap_ctx->last_free_summary_bit = end_summary_bit;
+
+out:
+       return error;
+
+}
+#endif //!HFS_ALLOC_TEST
+
+/*
+ * Read the bitmap into memory in chunks, and find a run of cleared/set bits;
+ * the run can extend across chunk boundaries.
+ * bit_count_clr can be passed to get a run of cleared bits.
+ * bit_count_set can be passed to get a run of set bits.
+ */
+static errno_t hfs_bit_count(bitmap_context_t *bitmap_ctx, int (*fn)(void *, int ,int), uint32_t *bit_count)
+{
+       int count;
+       errno_t error = 0;
+
+       *bit_count = 0;
+
+       do {
+               if (bitmap_ctx->run_offset == 0 || bitmap_ctx->chunk_current == bitmap_ctx->chunk_end) {
+                       if ((error = get_more_bits(bitmap_ctx)) != 0)
+                               goto out;
+               }
+
+               if (bitmap_ctx->chunk_end == 0)
+                       break;
+
+               count = fn(bitmap_ctx->bitmap, bitmap_ctx->chunk_current, bitmap_ctx->chunk_end);
+
+               bitmap_ctx->run_offset += count;
+               bitmap_ctx->chunk_current += count;
+               *bit_count += count;
+
+       } while (bitmap_ctx->chunk_current >= bitmap_ctx->chunk_end && count);
+
+out:
+       return error;
+
+}
+
+// Returns count of number of bits clear
+static errno_t hfs_bit_count_clr(bitmap_context_t *bitmap_ctx, uint32_t *count)
+{
+       return hfs_bit_count(bitmap_ctx, bit_count_clr, count);
+}
+
+// Returns count of number of bits set
+static errno_t hfs_bit_count_set(bitmap_context_t *bitmap_ctx, uint32_t *count)
+{
+       return hfs_bit_count(bitmap_ctx, bit_count_set, count);
+}
+
+static uint32_t hfs_bit_offset(bitmap_context_t *bitmap_ctx)
+{
+       return bitmap_ctx->run_offset;
+}
+
+/*
+ * Perform a full scan of the bitmap file.
+ * Note: during the scan of the bitmap file, it may drop and reacquire the
+ * bitmap lock to let someone else use the bitmap for fairness.
+ * Currently it is used by HFS_GET_FSINFO statistics gathering, which
+ * is run while other processes might perform HFS operations.
+ */
+
+errno_t hfs_find_free_extents(struct hfsmount *hfsmp,
+                                                         void (*callback)(void *data, off_t free_extent_size), void *callback_arg)
+{
+       struct bitmap_context bitmap_ctx;
+       uint32_t count;
+       errno_t error = 0;
+
+       if ((hfsmp->hfs_flags & HFS_SUMMARY_TABLE) == 0) {
+               error = hfs_init_summary(hfsmp);
+               if (error)
+                       return error;
+       }
+
+       bzero(&bitmap_ctx, sizeof(struct bitmap_context));
+
+       /*
+        * The journal maintains a list of recently deallocated blocks to
+        * issue DKIOCUNMAPs when the corresponding journal transaction is
+        * flushed to the disk.  To avoid any race conditions, we only
+        * want one active trim list.  Therefore we make sure that the
+        * journal trim list is sync'ed, empty, and not modifiable for
+        * the duration of our scan.
+        *
+        * Take the journal lock before flushing the journal to the disk.
+        * We will keep holding the journal lock until we get the
+        * bitmap lock, to make sure that no new journal transactions can
+        * start.  This will make sure that the journal trim list is not
+        * modified after the journal flush and before getting bitmap lock.
+        * We can release the journal lock after we acquire the bitmap
+        * lock as it will prevent any further block deallocations.
+        */
+       hfs_journal_lock(hfsmp);
+
+       /* Flush the journal and wait for all I/Os to finish up */
+       error = hfs_journal_flush(hfsmp, TRUE);
+       if (error) {
+               hfs_journal_unlock(hfsmp);
+               return error;
+       }
+
+       /*
+        * Take bitmap lock to ensure it is not being modified.
+        * Since we are reading larger than normal blocks from the bitmap, which
+        * might confuse other parts of the bitmap code using normal blocks, we
+        * take exclusive lock here.
+        */
+       bitmap_ctx.lockflags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK);
+
+#if !HFS_ALLOC_TEST
+       bitmap_ctx.lock_start = mach_absolute_time();
+#endif
+
+       /* Release the journal lock */
+       hfs_journal_unlock(hfsmp);
+
+       /*
+        * Bitmap is read in large block size (up to 1MB),
+        * unlike the runtime which reads the bitmap in the
+        * 4K block size.  If the bitmap is read both ways
+        * at the same time, it can result in multiple buf_t with
+        * different sizes and potentially cause data corruption.
+        * To avoid this, we invalidate all the existing buffers
+        * associated with the bitmap vnode.
+        */
+       error = buf_invalidateblks(hfsmp->hfs_allocation_vp, 0, 0, 0);
+       if (error)
+               goto out;
+
+       /*
+        * Get the list of all free extent ranges.  For each run of clear
+        * bits found below, the caller-provided callback is invoked to
+        * account for that free extent range.
+        */
+       bitmap_ctx.hfsmp = hfsmp;
+       bitmap_ctx.run_offset = 0;
+
+       while (bitmap_ctx.run_offset < hfsmp->totalBlocks) {
+
+               uint32_t start = hfs_bit_offset(&bitmap_ctx);
+
+               if ((error = hfs_bit_count_clr(&bitmap_ctx, &count)) != 0)
+                       goto out;
+
+               if (count)
+                       callback(callback_arg, hfs_blk_to_bytes(count, hfsmp->blockSize));
+
+               if ((error = update_summary_table(&bitmap_ctx, start, count, false)) != 0)
+                       goto out;
+
+               start = hfs_bit_offset(&bitmap_ctx);
+
+               if ((error = hfs_bit_count_set(&bitmap_ctx, &count)) != 0)
+                       goto out;
+
+               if ((error = update_summary_table(&bitmap_ctx, start, count, true)) != 0)
+                       goto out;
+       }
+
+out:
+       if (bitmap_ctx.lockflags) {
+               hfs_systemfile_unlock(hfsmp, bitmap_ctx.lockflags);
+       }
+
+       return error;
+}
+
index 18e64caf43db9cdd382a360b38e6bb94831ec674..30eb8a84eafd7952b2f0aca8638654a4d5c1d6d9 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2013 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
 #include <sys/param.h>
 #include <sys/vnode.h>
 
+#if !HFS_ALLOC_TEST
+
 #include "../../hfs.h"
 #include "../../hfs_macos_defs.h"
 #include "../../hfs_format.h"
 #include "../../hfs_cnode.h"
 
+#endif
 
 #ifdef __cplusplus
 extern "C" {
@@ -255,6 +258,9 @@ ScanUnmapBlocks(struct hfsmount *hfsmp);
 EXTERN_API_C( int )
 hfs_init_summary (struct hfsmount *hfsmp);
 
+errno_t hfs_find_free_extents(struct hfsmount *hfsmp,
+                                                         void (*callback)(void *data, off_t), void *callback_arg);
+
 /*     File Extent Mapping routines*/
 EXTERN_API_C( OSErr )
 FlushExtentFile                                        (ExtendedVCB *                  vcb);
index 105922628243f3dfbbf1b2768570f21eeef29825..b344c7a9de873f9bc3c50fb7ce813e5976df4ceb 100644 (file)
@@ -287,6 +287,7 @@ void bsd_utaskbootstrap(void);
 
 static void parse_bsd_args(void);
 extern task_t bsd_init_task;
+extern boolean_t init_task_died;
 extern char    init_task_failure_data[];
 #if CONFIG_DEV_KMEM
 extern void dev_kmem_init(void);
@@ -1013,6 +1014,7 @@ bsdinit_task(void)
        ut = (uthread_t)get_bsdthread_info(thread);
 
        bsd_init_task = get_threadtask(thread);
+       init_task_died = FALSE;
        init_task_failure_data[0] = 0;
 
 #if CONFIG_MACF
index 1b89e2d4aa3ad573c6e23a0436161446c60c355f..65c98080d0554a3c64b094ddd464793dee2ccc07 100644 (file)
 
 #include <machine/pal_routines.h>
 
+extern boolean_t kdebug_serial;
+#if KDEBUG_MOJO_TRACE
+#include <sys/kdebugevents.h>
+static void kdebug_serial_print(       /* forward */
+               uint32_t, uint32_t, uint64_t,
+               uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t);
+#endif
+
 /*
  * IOP(s)
  *
@@ -302,7 +310,6 @@ pid_t global_state_pid = -1;       /* Used to control exclusive use of kd_buffer
 #define MACH_SysCall   0x010c0000
 #define DBG_SCALL_MASK 0xffff0000
 
-
 /* task to string structure */
 struct tts
 {
@@ -392,7 +399,6 @@ kdbg_set_tracing_enabled(boolean_t enabled, uint32_t trace_type)
 {
        int s = ml_set_interrupts_enabled(FALSE);
        lck_spin_lock(kds_spin_lock);
-
        if (enabled) {
                kdebug_enable |= trace_type;
                kd_ctrl_page.kdebug_slowcheck &= ~SLOW_NOLOG;
@@ -881,6 +887,12 @@ record_event:
        kdbp = &kdbip[coreid];
        timestamp &= KDBG_TIMESTAMP_MASK;
 
+#if KDEBUG_MOJO_TRACE
+       if (kdebug_enable & KDEBUG_ENABLE_SERIAL)
+               kdebug_serial_print(coreid, debugid, timestamp,
+                                   arg1, arg2, arg3, arg4, threadid);
+#endif
+
 retry_q:
        kds_raw = kdbp->kd_list_tail;
 
@@ -1057,6 +1069,14 @@ record_event:
 
        cpu = cpu_number();
        kdbp = &kdbip[cpu];
+
+#if KDEBUG_MOJO_TRACE
+       if (kdebug_enable & KDEBUG_ENABLE_SERIAL)
+               kdebug_serial_print(cpu, debugid,
+                                   mach_absolute_time() & KDBG_TIMESTAMP_MASK,
+                                   arg1, arg2, arg3, arg4, arg5);
+#endif
+
 retry_q:
        kds_raw = kdbp->kd_list_tail;
 
@@ -1168,7 +1188,7 @@ kernel_debug_string(const char *message)
        /* Stuff the message string in the args and log it. */
         strncpy((char *)arg, message, MIN(sizeof(arg), strlen(message)));
        KERNEL_DEBUG_EARLY(
-               (TRACEDBG_CODE(DBG_TRACE_INFO, 4)) | DBG_FUNC_NONE,
+               TRACE_INFO_STRING,
                arg[0], arg[1], arg[2], arg[3]);
 }
 
@@ -1186,8 +1206,10 @@ kernel_debug_early(
        uintptr_t       arg4)
 {
        /* If tracing is already initialized, use it */
-       if (nkdbufs)
+       if (nkdbufs) {
                KERNEL_DEBUG_CONSTANT(debugid, arg1, arg2, arg3, arg4, 0);
+               return;
+       }
 
        /* Do nothing if the buffer is full or we're not on the boot cpu */ 
        kd_early_overflow = kd_early_index >= KD_EARLY_BUFFER_MAX;
@@ -1206,7 +1228,7 @@ kernel_debug_early(
 }
 
 /*
- * Transfer the contents of the temporary buffer into the trace buffers.
+ * Transfer the contents of the temporary buffer into the trace buffers.
  * Precede that by logging the rebase time (offset) - the TSC-based time (in ns)
  * when mach_absolute_time is set to 0.
  */
@@ -1221,7 +1243,7 @@ kernel_debug_early_end(void)
        /* Fake sentinel marking the start of kernel time relative to TSC */
        kernel_debug_enter(
                0,
-               (TRACEDBG_CODE(DBG_TRACE_INFO, 1)) | DBG_FUNC_NONE,
+               TRACE_TIMESTAMPS,
                0,
                (uint32_t)(tsc_rebase_abs_time >> 32),
                (uint32_t)tsc_rebase_abs_time,
@@ -1243,7 +1265,7 @@ kernel_debug_early_end(void)
        /* Cut events-lost event on overflow */
        if (kd_early_overflow)
                KERNEL_DEBUG_CONSTANT(
-                       TRACEDBG_CODE(DBG_TRACE_INFO, 2), 0, 0, 0, 0, 0);
+                       TRACE_LOST_EVENTS, 0, 0, 0, 0, 0);
 
        /* This trace marks the start of kernel tracing */
        kernel_debug_string("early trace done");
@@ -2453,9 +2475,9 @@ kdbg_control(int *name, u_int namelen, user_addr_t where, size_t *sizep)
                                if (name[0] == KERN_KDWRITETR) {
                                        number = nkdbufs * sizeof(kd_buf);
 
-                                       KERNEL_DEBUG_CONSTANT((TRACEDBG_CODE(DBG_TRACE_INFO, 3)) | DBG_FUNC_START, 0, 0, 0, 0, 0);
+                                       KERNEL_DEBUG_CONSTANT(TRACE_WRITING_EVENTS | DBG_FUNC_START, 0, 0, 0, 0, 0);
                                        ret = kdbg_read(0, &number, vp, &context);
-                                       KERNEL_DEBUG_CONSTANT((TRACEDBG_CODE(DBG_TRACE_INFO, 3)) | DBG_FUNC_END, number, 0, 0, 0, 0);
+                                       KERNEL_DEBUG_CONSTANT(TRACE_WRITING_EVENTS | DBG_FUNC_END, number, 0, 0, 0, 0);
 
                                        *sizep = number;
                                } else {
@@ -2635,7 +2657,7 @@ kdbg_read(user_addr_t buffer, size_t *number, vnode_t vp, vfs_context_t ctx)
                return EINVAL;
 
        memset(&lostevent, 0, sizeof(lostevent));
-       lostevent.debugid = TRACEDBG_CODE(DBG_TRACE_INFO, 2);
+       lostevent.debugid = TRACE_LOST_EVENTS;
 
        /* Capture timestamp. Only sort events that have occurred before the timestamp.
         * Since the iop is being flushed here, it's possible that events occur on the AP
@@ -3107,7 +3129,11 @@ start_kern_tracing(unsigned int new_nkdbufs, boolean_t need_map)
                /* Hold off interrupts until the early traces are cut */
                boolean_t       s = ml_set_interrupts_enabled(FALSE);
 
-               kdbg_set_tracing_enabled(TRUE, KDEBUG_ENABLE_TRACE);
+               kdbg_set_tracing_enabled(
+                       TRUE,
+                       kdebug_serial ?
+                               (KDEBUG_ENABLE_TRACE | KDEBUG_ENABLE_SERIAL) :
+                                KDEBUG_ENABLE_TRACE);
 
                /*
                 * Transfer all very early events from the static buffer
@@ -3118,8 +3144,14 @@ start_kern_tracing(unsigned int new_nkdbufs, boolean_t need_map)
                ml_set_interrupts_enabled(s);
 
                printf("kernel tracing started\n");
+#if KDEBUG_MOJO_TRACE
+               if (kdebug_serial) {
+                       printf("serial output enabled with %lu named events\n",
+                       sizeof(kd_events)/sizeof(kd_event_t));
+               }
+#endif
        } else {
-               printf("error from kdbg_reinit,kernel tracing not started\n");
+               printf("error from kdbg_reinit, kernel tracing not started\n");
        }
 }
 
@@ -3167,7 +3199,7 @@ kdbg_dump_trace_to_file(const char *filename)
                        return;
                }
        }
-       KERNEL_DEBUG_CONSTANT((TRACEDBG_CODE(DBG_TRACE_INFO, 0)) | DBG_FUNC_NONE, 0, 0, 0, 0, 0);
+       KERNEL_DEBUG_CONSTANT(TRACE_PANIC | DBG_FUNC_NONE, 0, 0, 0, 0, 0);
 
        kdebug_enable = 0;
        kd_ctrl_page.enabled = 0;
@@ -3209,3 +3241,146 @@ void kdbg_get_task_name(char* name_buf, int len, task_t task)
        else
                snprintf(name_buf, len, "%p [!bsd]", task);
 }
+
+#if KDEBUG_MOJO_TRACE
+static kd_event_t *
+binary_search(uint32_t id)
+{
+       int low, high, mid;
+
+       low = 0;
+       high = sizeof(kd_events)/sizeof(kd_event_t) - 1;
+
+       while (TRUE)
+       {
+               mid = (low + high) / 2;
+
+               if (low > high)
+                       return NULL; /* failed */
+               else if ( low + 1 >= high) {
+                       /* We have a match */
+                       if (kd_events[high].id == id)
+                               return &kd_events[high];
+                       else if (kd_events[low].id == id)
+                               return &kd_events[low];
+                       else
+                               return NULL;  /* search failed */
+               }
+               else if (id < kd_events[mid].id)
+                       high = mid;
+               else
+                       low = mid;
+       } 
+}
+
+/*
+ * Look up event id to get name string.
+ * Using a per-cpu cache of a single entry
+ * before resorting to a binary search of the full table.
+ */
+#define        NCACHE  1
+static kd_event_t      *last_hit[MAX_CPUS];
+static kd_event_t *
+event_lookup_cache(uint32_t cpu, uint32_t id)
+{
+       if (last_hit[cpu] == NULL || last_hit[cpu]->id != id)
+               last_hit[cpu] = binary_search(id);
+       return last_hit[cpu];
+}
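
The cache is a single remembered entry per CPU, so bursts of the same event id skip the search entirely; binary_search() in turn relies on kd_events (provided by the generated sys/kdebugevents.h) being sorted by ascending id. A stand-alone illustration of the same lookup contract, written as a conventional binary search over made-up ids:

    #include <stdint.h>
    #include <stdio.h>

    typedef struct {
            uint32_t        id;
            const char      *name;
    } ev_t;

    /* Must be sorted by ascending id, like kd_events.  Ids here are illustrative. */
    static const ev_t table[] = {
            { 0x01020000, "MACH_example_a" },
            { 0x010c0000, "MACH_SysCall"   },
            { 0x03010000, "VFS_example"    },
    };

    static const ev_t *
    lookup(uint32_t id)
    {
            int low = 0, high = (int)(sizeof(table) / sizeof(table[0])) - 1;

            while (low <= high) {
                    int mid = (low + high) / 2;

                    if (table[mid].id == id)
                            return &table[mid];
                    if (id < table[mid].id)
                            high = mid - 1;
                    else
                            low = mid + 1;
            }
            return NULL;    /* unknown id: kdebug_serial_print() falls back to the raw hex */
    }

    int main(void)
    {
            const ev_t *e = lookup(0x010c0000);

            printf("%s\n", e ? e->name : "unknown");
            return 0;
    }
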
+
+static uint64_t        kd_last_timstamp;
+
+static void
+kdebug_serial_print(
+       uint32_t        cpunum,
+       uint32_t        debugid,
+       uint64_t        timestamp,
+       uintptr_t       arg1,
+       uintptr_t       arg2,
+       uintptr_t       arg3,
+       uintptr_t       arg4,
+       uintptr_t       threadid
+       )
+{
+       char            kprintf_line[192];
+       char            event[40];
+       uint64_t        us = timestamp / NSEC_PER_USEC;
+       uint64_t        us_tenth = (timestamp % NSEC_PER_USEC) / 100;
+       uint64_t        delta = timestamp - kd_last_timstamp;
+       uint64_t        delta_us = delta / NSEC_PER_USEC;
+       uint64_t        delta_us_tenth = (delta % NSEC_PER_USEC) / 100;
+       uint32_t        event_id = debugid & DBG_FUNC_MASK;
+       const char      *command;
+       const char      *bra;
+       const char      *ket;
+       kd_event_t      *ep;
+
+       /* event time and delta from last */
+       snprintf(kprintf_line, sizeof(kprintf_line),
+               "%11llu.%1llu %8llu.%1llu ",
+               us, us_tenth, delta_us, delta_us_tenth);
+
+
+       /* event (id or name) - start prefixed by "[", end postfixed by "]" */
+       bra = (debugid & DBG_FUNC_START) ? "[" : " ";
+       ket = (debugid & DBG_FUNC_END)   ? "]" : " ";
+       ep = event_lookup_cache(cpunum, event_id);
+       if (ep) {
+               if (strlen(ep->name) < sizeof(event) - 3)
+                       snprintf(event, sizeof(event), "%s%s%s",
+                                bra, ep->name, ket);
+               else
+                       snprintf(event, sizeof(event), "%s%x(name too long)%s",
+                                bra, event_id, ket);
+       } else {
+               snprintf(event, sizeof(event), "%s%x%s",
+                        bra, event_id, ket);
+       }
+       snprintf(kprintf_line + strlen(kprintf_line),
+                sizeof(kprintf_line) - strlen(kprintf_line),
+                "%-40s  ", event);
+
+       /* arg1 .. arg4 with special cases for strings */
+       switch (event_id) {
+           case VFS_LOOKUP:
+           case VFS_LOOKUP_DONE:
+               if (debugid & DBG_FUNC_START) {
+                       /* arg1 hex then arg2..arg4 chars */
+                       snprintf(kprintf_line + strlen(kprintf_line),
+                               sizeof(kprintf_line) - strlen(kprintf_line),
+                               "%-16lx %-8s%-8s%-8s                          ",
+                               arg1, (char*)&arg2, (char*)&arg3, (char*)&arg4);
+                       break;
+               }
+               /* else fall through for arg1..arg4 chars */
+           case TRACE_STRING_EXEC:
+           case TRACE_STRING_NEWTHREAD:
+           case TRACE_INFO_STRING:
+               snprintf(kprintf_line + strlen(kprintf_line),
+                       sizeof(kprintf_line) - strlen(kprintf_line),
+                       "%-8s%-8s%-8s%-8s                                   ",
+                       (char*)&arg1, (char*)&arg2, (char*)&arg3, (char*)&arg4);
+               break;
+           default:
+               snprintf(kprintf_line + strlen(kprintf_line),
+                       sizeof(kprintf_line) - strlen(kprintf_line),
+                       "%-16lx %-16lx %-16lx %-16lx",
+                       arg1, arg2, arg3, arg4);
+       }
+
+       /* threadid, cpu and command name */
+       if (threadid == (uintptr_t)thread_tid(current_thread()) &&
+           current_proc() &&
+           current_proc()->p_comm)
+               command = current_proc()->p_comm;
+       else
+               command = "-";
+       snprintf(kprintf_line + strlen(kprintf_line),
+               sizeof(kprintf_line) - strlen(kprintf_line),
+               "  %-16lx  %-2d %s\n",
+               threadid, cpunum, command);
+       
+       kprintf("%s", kprintf_line);
+       kd_last_timstamp = timestamp;
+}
+#endif
index 16a66ae8293ceceee4c03f017e838c6f59b41a5e..8e5b0150bd8775b1dd5aac552cbc792e2df70012 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1999-2014 Apple Inc. All rights reserved.
+ * Copyright (c) 1999-2015 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -317,6 +317,8 @@ ctl_connect(struct socket *so, struct sockaddr *nam, struct proc *p)
        struct sockaddr_ctl     sa;
        struct ctl_cb           *kcb = (struct ctl_cb *)so->so_pcb;
        struct ctl_cb           *kcb_next = NULL;
+       u_quad_t                sbmaxsize;
+       u_int32_t               recvbufsize, sendbufsize;
 
        if (kcb == 0)
                panic("ctl_connect so_pcb null\n");
@@ -391,11 +393,27 @@ ctl_connect(struct socket *so, struct sockaddr *nam, struct proc *p)
        kctlstat.kcs_connections++;
        lck_mtx_unlock(ctl_mtx);
 
-       error = soreserve(so, kctl->sendbufsize, kctl->recvbufsize);
+       /*
+        * rdar://15526688: Limit the send and receive sizes to sb_max
+        * by using the same scaling as sbreserve()
+        */
+       sbmaxsize = (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES);
+
+       if (kctl->sendbufsize > sbmaxsize)
+               sendbufsize = sbmaxsize;
+       else
+               sendbufsize = kctl->sendbufsize;
+
+       if (kctl->recvbufsize > sbmaxsize)
+               recvbufsize = sbmaxsize;
+       else
+               recvbufsize = kctl->recvbufsize;
+
+       error = soreserve(so, sendbufsize, recvbufsize);
        if (error) {
                printf("%s - soreserve(%llx, %u, %u) error %d\n", __func__,
                        (uint64_t)VM_KERNEL_ADDRPERM(so),
-                       kctl->sendbufsize, kctl->recvbufsize, error);
+                       sendbufsize, recvbufsize, error);
                goto done;
        }
        soisconnecting(so);
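
The clamp mirrors the scaling used by sbreserve(), which charges per-mbuf bookkeeping (MSIZE) on top of each data cluster (MCLBYTES), so the largest request that still fits under sb_max is sb_max * MCLBYTES / (MSIZE + MCLBYTES). With the usual xnu values of MSIZE 256 and MCLBYTES 2048 that works out to 8/9 of sb_max; a quick stand-alone check (the 8 MB sb_max here is just an example value):

    #include <stdio.h>

    int main(void)
    {
            unsigned long long sb_max    = 8ULL * 1024 * 1024;      /* example value    */
            unsigned long long msize     = 256;                     /* typical MSIZE    */
            unsigned long long mclbytes  = 2048;                    /* typical MCLBYTES */
            unsigned long long sbmaxsize = sb_max * mclbytes / (msize + mclbytes);

            /* prints: sbmaxsize = 7456540 (0.889 of sb_max) */
            printf("sbmaxsize = %llu (%.3f of sb_max)\n",
                sbmaxsize, (double)sbmaxsize / (double)sb_max);
            return 0;
    }
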
@@ -631,7 +649,7 @@ ctl_rcvbspace(struct kctl *kctl, struct socket *so, u_int32_t datasize,
        struct sockbuf *sb = &so->so_rcv;
        u_int32_t space = sbspace(sb);
        errno_t error;
-       
+
        if ((kctl->flags & CTL_FLAG_REG_CRIT) == 0) {
                if ((u_int32_t) space >= datasize)
                        error = 0;
@@ -1116,10 +1134,9 @@ ctl_register(struct kern_ctl_reg *userkctl, kern_ctl_ref *kctlref)
 {
        struct kctl     *kctl = NULL;
        struct kctl     *kctl_next = NULL;
-       u_int32_t               id = 1;
-       size_t                  name_len;
-       int                             is_extended = 0;
-       u_quad_t        sbmaxsize;
+       u_int32_t       id = 1;
+       size_t          name_len;
+       int             is_extended = 0;
 
        if (userkctl == NULL)   /* sanity check */
                return (EINVAL);
@@ -1210,27 +1227,19 @@ ctl_register(struct kern_ctl_reg *userkctl, kern_ctl_ref *kctlref)
 
        /*
         * Let the caller know the default send and receive sizes
-        *
-        * rdar://15526688: Limit the send and receive sizes to sb_max
-        * by using the same scaling as sbreserve()
         */
-       sbmaxsize = (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES);
-
-       if (userkctl->ctl_sendsize == 0)
+       if (userkctl->ctl_sendsize == 0) {
                kctl->sendbufsize = CTL_SENDSIZE;
-       else if (userkctl->ctl_sendsize > sbmaxsize)
-               kctl->sendbufsize = sbmaxsize;
-       else
-       kctl->sendbufsize = userkctl->ctl_sendsize;
-       userkctl->ctl_sendsize = kctl->sendbufsize;
-
-       if (userkctl->ctl_recvsize == 0)
+               userkctl->ctl_sendsize = kctl->sendbufsize;
+       } else {
+               kctl->sendbufsize = userkctl->ctl_sendsize;
+       }
+       if (userkctl->ctl_recvsize == 0) {
                kctl->recvbufsize = CTL_RECVSIZE;
-       else if (userkctl->ctl_recvsize > sbmaxsize)
-               kctl->recvbufsize = sbmaxsize;
-       else
-       kctl->recvbufsize = userkctl->ctl_recvsize;
-       userkctl->ctl_recvsize = kctl->recvbufsize;
+               userkctl->ctl_recvsize = kctl->recvbufsize;
+       } else {
+               kctl->recvbufsize = userkctl->ctl_recvsize;
+       }
 
        kctl->connect = userkctl->ctl_connect;
        kctl->disconnect = userkctl->ctl_disconnect;
index 3b16c4ca0e38d9bc788686bc006243ce529a3ba4..708aef4747ad4ddd1bbeef79d7c7fdf43cedaa9f 100644 (file)
@@ -87,6 +87,7 @@
 #include <kern/clock.h>
 #include <kern/thread_call.h>
 #include <kern/sched_prim.h>
+#include <kern/wait_queue.h>
 #include <kern/zalloc.h>
 #include <kern/assert.h>
 
@@ -415,6 +416,7 @@ kqlock2knotedrop(struct kqueue *kq, struct knote *kn)
        int oktodrop;
 
        oktodrop = ((kn->kn_status & (KN_DROPPING | KN_ATTACHING)) == 0);
+       kn->kn_status &= ~KN_STAYQUEUED;
        kn->kn_status |= KN_DROPPING;
        if (oktodrop) {
                if (kn->kn_inuse == 0) {
@@ -1180,6 +1182,7 @@ kqueue_alloc(struct proc *p)
                        kq->kq_p = p;
                } else {
                        FREE_ZONE(kq, sizeof (struct kqueue), M_KQUEUE);
+                       kq = NULL;
                }
        }
 
@@ -2624,10 +2627,7 @@ knote_unlink_wait_queue(struct knote *kn, struct wait_queue *wq, wait_queue_link
        kern_return_t kr;
 
        kr = wait_queue_unlink_nofree(wq, kq->kq_wqs, wqlp);
-       kqlock(kq);
-       kn->kn_status &= ~KN_STAYQUEUED;
-       knote_dequeue(kn);
-       kqunlock(kq);
+       knote_clearstayqueued(kn);
        return ((kr != KERN_SUCCESS) ? EINVAL : 0);
 }
 
@@ -3517,3 +3517,12 @@ knote_markstayqueued(struct knote *kn)
        knote_enqueue(kn);
        kqunlock(kn->kn_kq);
 }
+
+void
+knote_clearstayqueued(struct knote *kn)
+{
+       kqlock(kn->kn_kq);
+       kn->kn_status &= ~KN_STAYQUEUED;
+       knote_dequeue(kn);
+       kqunlock(kn->kn_kq);
+}
index 3d2710538031d4d0903c33131eb019954f62193e..4816a4891c3ac3038a6ed24174bd6abf3039f343 100644 (file)
@@ -642,6 +642,13 @@ exec_fat_imgact(struct image_params *imgp)
                int nfat_arch = 0, pr = 0, f = 0;
 
                nfat_arch = OSSwapBigToHostInt32(fat_header->nfat_arch);
+
+               /* make sure bogus nfat_arch doesn't cause chaos - 19376072 */
+               if ( (sizeof(struct fat_header) + (nfat_arch * sizeof(struct fat_arch))) > PAGE_SIZE ) {
+                       error = EBADEXEC;
+                       goto bad;
+               }
+
                /* Check each preference listed against all arches in header */
                for (pr = 0; pr < NBINPREFS; pr++) {
                        cpu_type_t pref = psa->psa_binprefs[pr];
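
The new bound keeps the fat-arch walk inside the page of header data that exec reads from the file. With a 4 KB page and the usual Mach-O sizes (8 bytes for struct fat_header, 20 bytes for struct fat_arch), the check allows at most (4096 - 8) / 20 = 204 architecture entries and rejects anything larger with EBADEXEC before the per-architecture loop runs. A quick stand-alone check of that bound:

    #include <stdio.h>
    #include <stddef.h>

    int main(void)
    {
            size_t page_size       = 4096;
            size_t fat_header_size = 8;     /* magic + nfat_arch, 2 x uint32_t          */
            size_t fat_arch_size   = 20;    /* cputype, cpusubtype, offset, size, align */
            size_t max_nfat_arch   = (page_size - fat_header_size) / fat_arch_size;

            printf("largest accepted nfat_arch: %zu\n", max_nfat_arch);     /* 204 */
            return 0;
    }
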
@@ -1114,14 +1121,14 @@ grade:
                kdbg_trace_string(p, &dbg_arg1, &dbg_arg2, &dbg_arg3, &dbg_arg4);
 
                if (vfexec || spawn) {
-                       KERNEL_DEBUG_CONSTANT1((TRACEDBG_CODE(DBG_TRACE_DATA, 2)) | DBG_FUNC_NONE,
+                       KERNEL_DEBUG_CONSTANT1(TRACE_DATA_EXEC | DBG_FUNC_NONE,
                                        p->p_pid ,0,0,0, (uintptr_t)thread_tid(thread));
-                       KERNEL_DEBUG_CONSTANT1((TRACEDBG_CODE(DBG_TRACE_STRING, 2)) | DBG_FUNC_NONE,
+                       KERNEL_DEBUG_CONSTANT1(TRACE_STRING_EXEC | DBG_FUNC_NONE,
                                        dbg_arg1, dbg_arg2, dbg_arg3, dbg_arg4, (uintptr_t)thread_tid(thread));
                } else {
-                       KERNEL_DEBUG_CONSTANT((TRACEDBG_CODE(DBG_TRACE_DATA, 2)) | DBG_FUNC_NONE,
+                       KERNEL_DEBUG_CONSTANT(TRACE_DATA_EXEC | DBG_FUNC_NONE,
                                        p->p_pid ,0,0,0,0);
-                       KERNEL_DEBUG_CONSTANT((TRACEDBG_CODE(DBG_TRACE_STRING, 2)) | DBG_FUNC_NONE,
+                       KERNEL_DEBUG_CONSTANT(TRACE_STRING_EXEC | DBG_FUNC_NONE,
                                        dbg_arg1, dbg_arg2, dbg_arg3, dbg_arg4, 0);
                }
        }
@@ -2429,7 +2436,7 @@ bad:
                /* notify only if it has not failed due to FP Key error */
                if ((p->p_lflag & P_LTERM_DECRYPTFAIL) == 0)
                        proc_knote(p, NOTE_EXEC);
-       } else {
+       } else if (error == 0) {
                /* reset the importance attribute from our previous life */
                task_importance_reset(p->task);
 
index c1bff3128f991905dddac61220a65dc40884129a..3d17f687c108126913aff2c2d25c116e0d81a69e 100644 (file)
@@ -149,6 +149,7 @@ extern void dtrace_lazy_dofs_destroy(proc_t);
 
 #include <sys/sdt.h>
 
+extern boolean_t init_task_died;
 extern char init_task_failure_data[];
 void proc_prepareexit(proc_t p, int rv, boolean_t perf_notify);
 void vfork_exit(proc_t p, int rv);
@@ -354,6 +355,7 @@ exit1_internal(proc_t p, int rv, int *retval, boolean_t thread_can_terminate, bo
                        sync(p, (void *)NULL, (int *)NULL);
                }
 #endif
+               init_task_died = TRUE;
                panic("%s died\nState at Last Exception:\n\n%s", 
                                                        (p->p_comm[0] != '\0' ?
                                                                p->p_comm :
index 1301dbeea3ce53914c8d011b735e3efa51ab54ab..23c602e8bb05d6cafe2590240b9592d914b6122b 100644 (file)
@@ -720,42 +720,34 @@ setuid(proc_t p, struct setuid_args *uap, __unused int32_t *retval)
        kauth_cred_t my_cred, my_new_cred;
        posix_cred_t my_pcred;
 
-
        uid = uap->uid;
 
+       /* get current credential and take a reference while we muck with it */
        my_cred = kauth_cred_proc_ref(p);
        my_pcred = posix_cred_get(my_cred);
 
        DEBUG_CRED_ENTER("setuid (%d/%d): %p %d\n", p->p_pid, (p->p_pptr ? p->p_pptr->p_pid : 0), my_cred, uap->uid);
        AUDIT_ARG(uid, uid);
 
-       if (uid != my_pcred->cr_ruid &&         /* allow setuid(getuid()) */
-           uid != my_pcred->cr_svuid &&        /* allow setuid(saved uid) */
-           (error = suser(my_cred, &p->p_acflag))) {
-               kauth_cred_unref(&my_cred);
-               return (error);
-       }
-       /*
-        * Everything's okay, do it.
-        */
+       for (;;) {
+               if (uid != my_pcred->cr_ruid &&         /* allow setuid(getuid()) */
+                   uid != my_pcred->cr_svuid &&        /* allow setuid(saved uid) */
+                   (error = suser(my_cred, &p->p_acflag))) {
+                       kauth_cred_unref(&my_cred);
+                       return (error);
+               }
 
-       /*
-        * If we are priviledged, then set the saved and real UID too;
-        * otherwise, just set the effective UID
-        */
-       if (suser(my_cred, &p->p_acflag) == 0) {
-               svuid = uid;
-               ruid = uid;
                /*
-                * Transfer proc count to new user.
-                * chgproccnt uses list lock for protection
+                * If we are privileged, then set the saved and real UID too;
+                * otherwise, just set the effective UID
                 */
-               (void)chgproccnt(uid, 1);
-               (void)chgproccnt(my_pcred->cr_ruid, -1);
-       }
-
-       /* get current credential and take a reference while we muck with it */
-       for (;;) {
+               if (suser(my_cred, &p->p_acflag) == 0) {
+                       svuid = uid;
+                       ruid = uid;
+               } else {
+                       svuid = KAUTH_UID_NONE;
+                       ruid = KAUTH_UID_NONE;
+               }
                /*
                 * Only set the gmuid if the current cred has not opt'ed out;
                 * this normally only happens when calling setgroups() instead
@@ -780,17 +772,39 @@ setuid(proc_t p, struct setuid_args *uap, __unused int32_t *retval)
 
                        DEBUG_CRED_CHANGE("setuid CH(%d): %p/0x%08x -> %p/0x%08x\n", p->p_pid, my_cred, my_pcred->cr_flags, my_new_cred, posix_cred_get(my_new_cred)->cr_flags);
 
+                       /*
+                        * If we're changing the ruid from A to B, we might race with another thread that's setting ruid from B to A.
+                        * The current locking mechanisms don't allow us to make the entire credential switch operation atomic,
+                        * thus we may be able to change the process credentials from ruid A to B, but get preempted before incrementing the proc
+                        * count of B. If a second thread sees the new process credentials and switches back to ruid A, that other thread
+                        * may be able to decrement the proc count of B before we can increment it. This results in a panic.
+                        * Incrementing the proc count of the target ruid, B, before setting the process credentials prevents this race.
+                        */
+                       if (ruid != KAUTH_UID_NONE) {
+                               (void)chgproccnt(ruid, 1);
+                       }
+
                        proc_lock(p);
                        /*
                         * We need to protect for a race where another thread
                         * also changed the credential after we took our
                         * reference.  If p_ucred has changed then we should
                         * restart this again with the new cred.
+                        *
+                        * Note: kauth_cred_setresuid has consumed a reference to my_cred; if p_ucred != my_cred, then my_cred must not be dereferenced!
                         */
                        if (p->p_ucred != my_cred) {
                                proc_unlock(p);
+                               /*
+                                * We didn't successfully switch to the new ruid, so decrement
+                                * the procs/uid count that we incremented above.
+                                */
+                               if (ruid != KAUTH_UID_NONE) {
+                                       (void)chgproccnt(ruid, -1);
+                               }
                                kauth_cred_unref(&my_new_cred);
                                my_cred = kauth_cred_proc_ref(p);
+                               my_pcred = posix_cred_get(my_cred);
                                /* try again */
                                continue;
                        }
@@ -800,6 +814,13 @@ setuid(proc_t p, struct setuid_args *uap, __unused int32_t *retval)
 
                        OSBitOrAtomic(P_SUGID, &p->p_flag);
                        proc_unlock(p);
+                       /*
+                        * If we've updated the ruid, decrement the count of procs running
+                        * under the previous ruid
+                        */
+                       if (ruid != KAUTH_UID_NONE) {
+                               (void)chgproccnt(my_pcred->cr_ruid, -1);
+                       }
                }
                break;
        }
@@ -845,18 +866,14 @@ seteuid(proc_t p, struct seteuid_args *uap, __unused int32_t *retval)
        my_cred = kauth_cred_proc_ref(p);
        my_pcred = posix_cred_get(my_cred);
 
-       if (euid != my_pcred->cr_ruid && euid != my_pcred->cr_svuid &&
-           (error = suser(my_cred, &p->p_acflag))) {
-               kauth_cred_unref(&my_cred);
-               return (error);
-       }
-
-       /*
-        * Everything's okay, do it.  Copy credentials so other references do
-        * not see our changes.  get current credential and take a reference 
-        * while we muck with it
-        */
        for (;;) {
+
+               if (euid != my_pcred->cr_ruid && euid != my_pcred->cr_svuid &&
+                       (error = suser(my_cred, &p->p_acflag))) {
+                       kauth_cred_unref(&my_cred);
+                       return (error);
+               }
+
                /* 
                 * Set the credential with new info.  If there is no change,
                 * we get back the same credential we passed in; if there is
@@ -881,6 +898,7 @@ seteuid(proc_t p, struct seteuid_args *uap, __unused int32_t *retval)
                                proc_unlock(p);
                                kauth_cred_unref(&my_new_cred);
                                my_cred = kauth_cred_proc_ref(p);
+                               my_pcred = posix_cred_get(my_cred);
                                /* try again */
                                continue;
                        }
@@ -953,32 +971,25 @@ setreuid(proc_t p, struct setreuid_args *uap, __unused int32_t *retval)
        my_cred = kauth_cred_proc_ref(p);
        my_pcred = posix_cred_get(my_cred);
 
-       if (((ruid != KAUTH_UID_NONE &&         /* allow no change of ruid */
-             ruid != my_pcred->cr_ruid &&      /* allow ruid = ruid */
-             ruid != my_pcred->cr_uid &&       /* allow ruid = euid */
-             ruid != my_pcred->cr_svuid) ||    /* allow ruid = svuid */
-            (euid != KAUTH_UID_NONE &&         /* allow no change of euid */
-             euid != my_pcred->cr_uid &&       /* allow euid = euid */
-             euid != my_pcred->cr_ruid &&      /* allow euid = ruid */
-             euid != my_pcred->cr_svuid)) &&   /* allow euid = svui */
-           (error = suser(my_cred, &p->p_acflag))) { /* allow root user any */
-               kauth_cred_unref(&my_cred);
-               return (error);
-       }
-
-       /*
-        * Everything's okay, do it.  Copy credentials so other references do
-        * not see our changes.  get current credential and take a reference 
-        * while we muck with it
-        */
        for (;;) {
+
+               if (((ruid != KAUTH_UID_NONE &&         /* allow no change of ruid */
+                     ruid != my_pcred->cr_ruid &&      /* allow ruid = ruid */
+                     ruid != my_pcred->cr_uid &&       /* allow ruid = euid */
+                     ruid != my_pcred->cr_svuid) ||    /* allow ruid = svuid */
+                    (euid != KAUTH_UID_NONE &&         /* allow no change of euid */
+                     euid != my_pcred->cr_uid &&       /* allow euid = euid */
+                     euid != my_pcred->cr_ruid &&      /* allow euid = ruid */
+                     euid != my_pcred->cr_svuid)) &&   /* allow euid = svuid */
+                   (error = suser(my_cred, &p->p_acflag))) { /* allow root user any */
+                       kauth_cred_unref(&my_cred);
+                       return (error);
+               }
+
                uid_t new_euid;
-               uid_t new_ruid;
                uid_t svuid = KAUTH_UID_NONE;
 
                new_euid = my_pcred->cr_uid;
-               new_ruid = my_pcred->cr_ruid;
-       
                /* 
                 * Set the credential with new info.  If there is no change,
                 * we get back the same credential we passed in; if there is
@@ -986,19 +997,11 @@ setreuid(proc_t p, struct setreuid_args *uap, __unused int32_t *retval)
                 * passed in.  The subsequent compare is safe, because it is
                 * a pointer compare rather than a contents compare.
                 */
-               if (euid == KAUTH_UID_NONE && my_pcred->cr_uid != euid) {
+               if (euid != KAUTH_UID_NONE && my_pcred->cr_uid != euid) {
                        /* changing the effective UID */
                        new_euid = euid;
                        OSBitOrAtomic(P_SUGID, &p->p_flag);
                }
-               if (ruid != KAUTH_UID_NONE && my_pcred->cr_ruid != ruid) {
-                       /* changing the real UID; must do user accounting */
-                       /* chgproccnt uses list lock for protection */
-                       (void)chgproccnt(ruid, 1);
-                       (void)chgproccnt(my_pcred->cr_ruid, -1);
-                       new_ruid = ruid;
-                       OSBitOrAtomic(P_SUGID, &p->p_flag);
-               }
                /*
                 * If the newly requested real uid or effective uid does
                 * not match the saved uid, then set the saved uid to the
@@ -1017,25 +1020,56 @@ setreuid(proc_t p, struct setreuid_args *uap, __unused int32_t *retval)
 
                        DEBUG_CRED_CHANGE("setreuid CH(%d): %p/0x%08x -> %p/0x%08x\n", p->p_pid, my_cred, my_pcred->cr_flags, my_new_cred, posix_cred_get(my_new_cred)->cr_flags);
 
+                       /*
+                        * If we're changing the ruid from A to B, we might race with another thread that's setting ruid from B to A.
+                        * The current locking mechanisms don't allow us to make the entire credential switch operation atomic,
+                        * thus we may be able to change the process credentials from ruid A to B, but get preempted before incrementing the proc
+                        * count of B. If a second thread sees the new process credentials and switches back to ruid A, that other thread
+                        * may be able to decrement the proc count of B before we can increment it. This results in a panic.
+                        * Incrementing the proc count of the target ruid, B, before setting the process credentials prevents this race.
+                        */
+                       if (ruid != KAUTH_UID_NONE) {
+                               (void)chgproccnt(ruid, 1);
+                       }
+
                        proc_lock(p);
                        /*
                         * We need to protect for a race where another thread
                         * also changed the credential after we took our
                         * reference.  If p_ucred has changed then we should
                         * restart this again with the new cred.
+                        *
+                        * Note: kauth_cred_setresuid has consumed a reference to my_cred; if p_ucred != my_cred, then my_cred must not be dereferenced!
                         */
                        if (p->p_ucred != my_cred) {
                                proc_unlock(p);
+                               if (ruid != KAUTH_UID_NONE) {
+                                       /*
+                                        * We didn't successfully switch to the new ruid, so decrement
+                                        * the procs/uid count that we incremented above.
+                                        */
+                                       (void)chgproccnt(ruid, -1);
+                               }
                                kauth_cred_unref(&my_new_cred);
                                my_cred = kauth_cred_proc_ref(p);
+                               my_pcred = posix_cred_get(my_cred);
                                /* try again */
                                continue;
                        }
+
                        p->p_ucred = my_new_cred;
                        /* update cred on proc */
                        PROC_UPDATE_CREDS_ONPROC(p);
-                       OSBitOrAtomic(P_SUGID, &p->p_flag); /* XXX redundant? */
+                       OSBitOrAtomic(P_SUGID, &p->p_flag);
                        proc_unlock(p);
+
+                       if (ruid != KAUTH_UID_NONE) {
+                               /*
+                                * We switched to a new ruid, so decrement the count of procs running
+                                * under the previous ruid
+                                */
+                               (void)chgproccnt(my_pcred->cr_ruid, -1);
+                       }
                }
                break;
        }
@@ -1087,28 +1121,30 @@ setgid(proc_t p, struct setgid_args *uap, __unused int32_t *retval)
        gid = uap->gid;
        AUDIT_ARG(gid, gid);
 
+       /* get current credential and take a reference while we muck with it */
        my_cred = kauth_cred_proc_ref(p);
        my_pcred = posix_cred_get(my_cred);
 
-       if (gid != my_pcred->cr_rgid &&         /* allow setgid(getgid()) */
-           gid != my_pcred->cr_svgid &&        /* allow setgid(saved gid) */
-           (error = suser(my_cred, &p->p_acflag))) {
-               kauth_cred_unref(&my_cred);
-               return (error);
-       }
+       for (;;) {
+               if (gid != my_pcred->cr_rgid &&         /* allow setgid(getgid()) */
+                   gid != my_pcred->cr_svgid &&        /* allow setgid(saved gid) */
+                   (error = suser(my_cred, &p->p_acflag))) {
+                       kauth_cred_unref(&my_cred);
+                       return (error);
+               }
 
-       /*
-        * If we are priviledged, then set the saved and real GID too;
-        * otherwise, just set the effective GID
-        */
-       if (suser(my_cred,  &p->p_acflag) == 0) {
-               svgid = gid;
-               rgid = gid;
-       }
+               /*
+                * If we are privileged, then set the saved and real GID too;
+                * otherwise, just set the effective GID
+                */
+               if (suser(my_cred,  &p->p_acflag) == 0) {
+                       svgid = gid;
+                       rgid = gid;
+               } else {
+                       svgid = KAUTH_GID_NONE;
+                       rgid = KAUTH_GID_NONE;
+               }
 
-       /* get current credential and take a reference while we muck with it */
-       for (;;) {
-               
                /* 
                 * Set the credential with new info.  If there is no change,
                 * we get back the same credential we passed in; if there is
@@ -1133,6 +1169,7 @@ setgid(proc_t p, struct setgid_args *uap, __unused int32_t *retval)
                                kauth_cred_unref(&my_new_cred);
                                /* try again */
                                my_cred = kauth_cred_proc_ref(p);
+                               my_pcred = posix_cred_get(my_cred);
                                continue;
                        }
                        p->p_ucred = my_new_cred;
@@ -1187,18 +1224,18 @@ setegid(proc_t p, struct setegid_args *uap, __unused int32_t *retval)
        egid = uap->egid;
        AUDIT_ARG(egid, egid);
 
+       /* get current credential and take a reference while we muck with it */
        my_cred = kauth_cred_proc_ref(p);
        my_pcred = posix_cred_get(my_cred);
 
-       if (egid != my_pcred->cr_rgid &&
-           egid != my_pcred->cr_svgid &&
-           (error = suser(my_cred, &p->p_acflag))) {
-               kauth_cred_unref(&my_cred);
-               return (error);
-       }
 
-       /* get current credential and take a reference while we muck with it */
        for (;;) {
+               if (egid != my_pcred->cr_rgid &&
+                   egid != my_pcred->cr_svgid &&
+                   (error = suser(my_cred, &p->p_acflag))) {
+                       kauth_cred_unref(&my_cred);
+                       return (error);
+               }
                /* 
                 * Set the credential with new info.  If there is no change,
                 * we get back the same credential we passed in; if there is
@@ -1223,6 +1260,7 @@ setegid(proc_t p, struct setegid_args *uap, __unused int32_t *retval)
                                kauth_cred_unref(&my_new_cred);
                                /* try again */
                                my_cred = kauth_cred_proc_ref(p);
+                               my_pcred = posix_cred_get(my_cred);
                                continue;
                        }
                        p->p_ucred = my_new_cred;
@@ -1298,25 +1336,26 @@ setregid(proc_t p, struct setregid_args *uap, __unused int32_t *retval)
        AUDIT_ARG(egid, egid);
        AUDIT_ARG(rgid, rgid);
 
+       /* get current credential and take a reference while we muck with it */
        my_cred = kauth_cred_proc_ref(p);
        my_pcred = posix_cred_get(my_cred);
 
-       if (((rgid != KAUTH_UID_NONE &&         /* allow no change of rgid */
-             rgid != my_pcred->cr_rgid &&      /* allow rgid = rgid */
-             rgid != my_pcred->cr_gid &&       /* allow rgid = egid */
-             rgid != my_pcred->cr_svgid) ||    /* allow rgid = svgid */
-            (egid != KAUTH_UID_NONE &&         /* allow no change of egid */
-             egid != my_pcred->cr_groups[0] && /* allow no change of egid */
-             egid != my_pcred->cr_gid &&       /* allow egid = egid */
-             egid != my_pcred->cr_rgid &&      /* allow egid = rgid */
-             egid != my_pcred->cr_svgid)) &&   /* allow egid = svgid */
-           (error = suser(my_cred, &p->p_acflag))) { /* allow root user any */
-               kauth_cred_unref(&my_cred);
-               return (error);
-       }
-
-       /* get current credential and take a reference while we muck with it */
        for (;;) {
+
+               if (((rgid != KAUTH_UID_NONE &&         /* allow no change of rgid */
+                     rgid != my_pcred->cr_rgid &&      /* allow rgid = rgid */
+                     rgid != my_pcred->cr_gid &&       /* allow rgid = egid */
+                     rgid != my_pcred->cr_svgid) ||    /* allow rgid = svgid */
+                    (egid != KAUTH_UID_NONE &&         /* allow no change of egid */
+                     egid != my_pcred->cr_groups[0] && /* allow no change of egid */
+                     egid != my_pcred->cr_gid &&       /* allow egid = egid */
+                     egid != my_pcred->cr_rgid &&      /* allow egid = rgid */
+                     egid != my_pcred->cr_svgid)) &&   /* allow egid = svgid */
+                   (error = suser(my_cred, &p->p_acflag))) { /* allow root user any */
+                       kauth_cred_unref(&my_cred);
+                       return (error);
+               }
+
                uid_t new_egid = my_pcred->cr_gid;
                uid_t new_rgid = my_pcred->cr_rgid;
                uid_t svgid = KAUTH_UID_NONE;
@@ -1329,7 +1368,7 @@ setregid(proc_t p, struct setregid_args *uap, __unused int32_t *retval)
                 * passed in.  The subsequent compare is safe, because it is
                 * a pointer compare rather than a contents compare.
                 */
-               if (egid == KAUTH_UID_NONE && my_pcred->cr_gid != egid) {
+               if (egid != KAUTH_UID_NONE && my_pcred->cr_gid != egid) {
                        /* changing the effective GID */
                        new_egid = egid;
                        OSBitOrAtomic(P_SUGID, &p->p_flag);
@@ -1367,6 +1406,7 @@ setregid(proc_t p, struct setregid_args *uap, __unused int32_t *retval)
                                kauth_cred_unref(&my_new_cred);
                                /* try again */
                                my_cred = kauth_cred_proc_ref(p);
+                               my_pcred = posix_cred_get(my_cred);
                                continue;
                        }
                        p->p_ucred = my_new_cred;
@@ -1387,7 +1427,7 @@ setregid(proc_t p, struct setregid_args *uap, __unused int32_t *retval)
 
 /*
  * Set the per-thread override identity.  The first parameter can be the
- * current real UID, KAUTH_UID_NONE, or, if the caller is priviledged, it
+ * current real UID, KAUTH_UID_NONE, or, if the caller is privileged, it
  * can be any UID.  If it is KAUTH_UID_NONE, then as a special case, this
  * means "revert to the per process credential"; otherwise, if permitted,
  * it changes the effective, real, and saved UIDs and GIDs for the current
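
The setuid()/setreuid() rework above changes when chgproccnt() runs relative to publishing the new credential: the target ruid's count is bumped before the credential swap, and either the old ruid's count (on success) or the target's (on retry) is dropped afterwards. A deliberately simplified, single-threaded sketch of that ordering invariant; proccnt[] and switch_ruid() are stand-ins, not XNU code:

    #include <assert.h>
    #include <stdio.h>

    #define NUIDS 4
    static int proccnt[NUIDS];      /* stand-in for the kernel per-uid counts */
    static int cur_ruid;            /* stand-in for p_ucred->cr_ruid */

    static void switch_ruid(int new_ruid)
    {
            proccnt[new_ruid]++;            /* chgproccnt(ruid, 1) before the swap */
            int old_ruid = cur_ruid;
            cur_ruid = new_ruid;            /* p->p_ucred = my_new_cred */
            proccnt[old_ruid]--;            /* chgproccnt(old ruid, -1) afterwards */
            assert(proccnt[cur_ruid] >= 1); /* the visible ruid is always accounted */
    }

    int main(void)
    {
            proccnt[0] = 1;
            cur_ruid = 0;
            switch_ruid(1);
            switch_ruid(0);
            printf("uid0=%d uid1=%d\n", proccnt[0], proccnt[1]);    /* 1 and 0 */
            return 0;
    }
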
index 251629b5b1a85afe64f35ab6191ddead502e666d..c7978d82e01270e2bf692e3fa9baf59fb9ccb55c 100644 (file)
@@ -180,6 +180,11 @@ extern unsigned int vm_page_free_reserved;
 extern unsigned int vm_page_speculative_percentage;
 extern unsigned int vm_page_speculative_q_age_ms;
 
+#if (DEVELOPMENT || DEBUG)
+extern uint32_t        vm_page_creation_throttled_hard;
+extern uint32_t        vm_page_creation_throttled_soft;
+#endif /* DEVELOPMENT || DEBUG */
+
 /*
  * Conditionally allow dtrace to see these functions for debugging purposes.
  */
@@ -2660,6 +2665,7 @@ SYSCTL_INT(_vm, OID_AUTO, vm_page_external_count, CTLFLAG_RD | CTLFLAG_LOCKED, &
 SYSCTL_INT(_vm, OID_AUTO, vm_page_filecache_min, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_page_filecache_min, 0, "");
 
 extern int     vm_compressor_mode;
+extern int     vm_compressor_is_active;
 extern uint32_t        swapout_target_age;
 extern int64_t  compressor_bytes_used;
 extern uint32_t        compressor_eval_period_in_msecs;
@@ -2673,6 +2679,7 @@ extern uint32_t   vm_compressor_unthrottle_threshold_divisor;
 extern uint32_t        vm_compressor_catchup_threshold_divisor;
 
 SYSCTL_INT(_vm, OID_AUTO, compressor_mode, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_compressor_mode, 0, "");
+SYSCTL_INT(_vm, OID_AUTO, compressor_is_active, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_compressor_is_active, 0, "");
 SYSCTL_QUAD(_vm, OID_AUTO, compressor_bytes_used, CTLFLAG_RD | CTLFLAG_LOCKED, &compressor_bytes_used, "");
 SYSCTL_INT(_vm, OID_AUTO, compressor_swapout_target_age, CTLFLAG_RD | CTLFLAG_LOCKED, &swapout_target_age, 0, "");
 
@@ -2699,6 +2706,18 @@ SYSCTL_INT(_vm, OID_AUTO, phantom_cache_thrashing_threshold, CTLFLAG_RW | CTLFLA
 SYSCTL_INT(_vm, OID_AUTO, phantom_cache_thrashing_threshold_ssd, CTLFLAG_RW | CTLFLAG_LOCKED, &phantom_cache_thrashing_threshold_ssd, 0, "");
 #endif
 
+#if (DEVELOPMENT || DEBUG)
+
+SYSCTL_UINT(_vm, OID_AUTO, vm_page_creation_throttled_hard,
+           CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED,
+           &vm_page_creation_throttled_hard, 0, "");
+
+SYSCTL_UINT(_vm, OID_AUTO, vm_page_creation_throttled_soft,
+           CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED,
+           &vm_page_creation_throttled_soft, 0, "");
+
+#endif /* DEVELOPMENT || DEBUG */
+
 /*
  * Enable tracing of voucher contents
  */
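
The sysctl additions above are readable from user space with sysctlbyname(3). A short sketch; the two page-creation throttle counters are only registered on DEVELOPMENT/DEBUG kernels, so the lookup is expected to fail on RELEASE builds:

    #include <sys/types.h>
    #include <sys/sysctl.h>
    #include <stdio.h>

    static void show(const char *name)
    {
            unsigned int val = 0;
            size_t len = sizeof(val);

            if (sysctlbyname(name, &val, &len, NULL, 0) == 0)
                    printf("%s = %u\n", name, val);
            else
                    printf("%s: not available on this kernel\n", name);
    }

    int main(void)
    {
            show("vm.compressor_is_active");
            show("vm.vm_page_creation_throttled_hard");   /* DEVELOPMENT || DEBUG only */
            show("vm.vm_page_creation_throttled_soft");   /* DEVELOPMENT || DEBUG only */
            return 0;
    }
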
index c590c52d69bc06494792ba87eba176923935575a..0f477bdf07927899dd8b4e6e4c62ce11d91f58da 100644 (file)
@@ -113,7 +113,8 @@ static load_result_t load_result_null = {
        .csflags = 0,
        .uuid = { 0 },
        .min_vm_addr = MACH_VM_MAX_ADDRESS,
-       .max_vm_addr = MACH_VM_MIN_ADDRESS
+       .max_vm_addr = MACH_VM_MIN_ADDRESS,
+       .cs_end_offset = 0
 };
 
 /*
@@ -772,6 +773,37 @@ parse_machfile(
                                } else {
                                        got_code_signatures = TRUE;
                                }
+
+                               if (got_code_signatures) {
+                                       boolean_t valid = FALSE, tainted = TRUE;
+                                       struct cs_blob *blobs;
+                                       vm_size_t off = 0;
+
+
+                                       if (cs_debug > 10)
+                                               printf("validating initial pages of %s\n", vp->v_name);
+                                       blobs = ubc_get_cs_blobs(vp);
+                                       
+                                       while (off < size && ret == LOAD_SUCCESS) {
+                                            valid = cs_validate_page(blobs,
+                                                                     NULL,
+                                                                     file_offset + off,
+                                                                     addr + off,
+                                                                     &tainted);
+                                            if (!valid || tainted) {
+                                                    if (cs_debug)
+                                                            printf("CODE SIGNING: %s[%d]: invalid initial page at offset %lld validated:%d tainted:%d csflags:0x%x\n", 
+                                                                   vp->v_name, p->p_pid, (long long)(file_offset + off), valid, tainted, result->csflags);
+                                                    if (cs_enforcement(NULL) ||
+                                                        (result->csflags & (CS_HARD|CS_KILL|CS_ENFORCEMENT))) {
+                                                            ret = LOAD_FAILURE;
+                                                    }
+                                                    result->csflags &= ~CS_VALID;
+                                            }
+                                            off += PAGE_SIZE;
+                                       }
+                               }
+
                                break;
 #if CONFIG_CODE_DECRYPTION
                        case LC_ENCRYPTION_INFO:
@@ -991,6 +1023,20 @@ load_segment(
        if ((scp->fileoff & PAGE_MASK_64) != 0)
                return (LOAD_BADMACHO);
 
+       /*
+        * If we have a code signature attached for this slice,
+        * require that the segments are within the signed part
+        * of the file.
+        */
+       if (result->cs_end_offset &&
+           result->cs_end_offset < (off_t)scp->fileoff &&
+           result->cs_end_offset - scp->fileoff < scp->filesize)
+        {
+               if (cs_debug)
+                       printf("section outside code signature\n");
+               return LOAD_BADMACHO;
+       }
+
        /*
         *      Round sizes to page size.
         */
@@ -1290,25 +1336,46 @@ load_threadstate(
        uint32_t        size;
        int             flavor;
        uint32_t        thread_size;
+       uint32_t        *local_ts;
+       uint32_t        local_ts_size;
 
-    ret = thread_state_initialize( thread );
-    if (ret != KERN_SUCCESS) {
-        return(LOAD_FAILURE);
-    }
+       local_ts = NULL;
+       local_ts_size = 0;
+
+       ret = thread_state_initialize( thread );
+       if (ret != KERN_SUCCESS) {
+               ret = LOAD_FAILURE;
+               goto done;
+       }
     
+       if (total_size > 0) {
+               local_ts_size = total_size;
+               local_ts = kalloc(local_ts_size);
+               if (local_ts == NULL) {
+                       ret = LOAD_FAILURE;
+                       goto done;
+               }
+               memcpy(local_ts, ts, local_ts_size);
+               ts = local_ts;
+       }
+
        /*
-        *      Set the new thread state; iterate through the state flavors in
-      the mach-o file.
+        * Set the new thread state; iterate through the state flavors in
+        * the mach-o file.
         */
        while (total_size > 0) {
                flavor = *ts++;
                size = *ts++;
                if (UINT32_MAX-2 < size ||
-                   UINT32_MAX/sizeof(uint32_t) < size+2)
-                       return (LOAD_BADMACHO);
+                   UINT32_MAX/sizeof(uint32_t) < size+2) {
+                       ret = LOAD_BADMACHO;
+                       goto done;
+               }
                thread_size = (size+2)*sizeof(uint32_t);
-               if (thread_size > total_size)
-                       return(LOAD_BADMACHO);
+               if (thread_size > total_size) {
+                       ret = LOAD_BADMACHO;
+                       goto done;
+               }
                total_size -= thread_size;
                /*
                 * Third argument is a kernel space pointer; it gets cast
@@ -1317,11 +1384,19 @@ load_threadstate(
                 */
                ret = thread_setstatus(thread, flavor, (thread_state_t)ts, size);
                if (ret != KERN_SUCCESS) {
-                       return(LOAD_FAILURE);
+                       ret = LOAD_FAILURE;
+                       goto done;
                }
                ts += size;     /* ts is a (uint32_t *) */
        }
-       return(LOAD_SUCCESS);
+       ret = LOAD_SUCCESS;
+
+done:
+       if (local_ts != NULL) {
+               kfree(local_ts, local_ts_size);
+               local_ts = NULL;
+       }
+       return ret;
 }
 
 static
@@ -1584,7 +1659,7 @@ load_code_signature(
                goto out;
        }
 
-       blob = ubc_cs_blob_get(vp, cputype, -1);
+       blob = ubc_cs_blob_get(vp, cputype, macho_offset);
        if (blob != NULL) {
                /* we already have a blob for this vnode and cputype */
                if (blob->csb_cpu_type == cputype &&
@@ -1644,13 +1719,14 @@ load_code_signature(
        ubc_cs_validation_bitmap_allocate( vp );
 #endif
                
-       blob = ubc_cs_blob_get(vp, cputype, -1);
+       blob = ubc_cs_blob_get(vp, cputype, macho_offset);
 
        ret = LOAD_SUCCESS;
 out:
        if (ret == LOAD_SUCCESS) {
                result->csflags |= blob->csb_flags;
                result->platform_binary = blob->csb_platform_binary;
+               result->cs_end_offset = blob->csb_end_offset;
        }
        if (addr != 0) {
                ubc_cs_blob_deallocate(addr, blob_size);
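
For context on the load_threadstate() changes above: an LC_UNIXTHREAD payload is a sequence of (flavor, count, count 32-bit words of register state) records, and the function now iterates over a kernel-owned copy with the overflow checks shown. A hedged userspace sketch of the same walk; the names are illustrative and the input is assumed to be well formed:

    #include <stdint.h>
    #include <stdio.h>

    static int walk_thread_state(const uint32_t *ts, uint32_t total_size)
    {
            while (total_size > 0) {
                    uint32_t flavor = *ts++;
                    uint32_t count  = *ts++;

                    /* same guards as the patch: reject counts that would overflow */
                    if (UINT32_MAX - 2 < count ||
                        UINT32_MAX / sizeof(uint32_t) < count + 2)
                            return -1;                      /* LOAD_BADMACHO */
                    uint32_t rec = (count + 2) * (uint32_t)sizeof(uint32_t);
                    if (rec > total_size)
                            return -1;                      /* LOAD_BADMACHO */
                    printf("flavor %u, %u words of register state\n", flavor, count);
                    total_size -= rec;
                    ts += count;
            }
            return 0;                                       /* LOAD_SUCCESS */
    }

    int main(void)
    {
            uint32_t blob[] = { 7, 2, 0xaaaa, 0xbbbb };     /* one fake record */
            return walk_thread_state(blob, (uint32_t)sizeof(blob));
    }
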
index d1c83d1f9f12badc884aa33ad49438bbf1a8e333..dc0dbfa5bc3c1e0701ffd3a9cdd9183bc558b9ae 100644 (file)
@@ -70,6 +70,7 @@ typedef struct _load_result {
        mach_vm_address_t       min_vm_addr;
        mach_vm_address_t       max_vm_addr;
        unsigned int            platform_binary;
+       off_t                   cs_end_offset;
 } load_result_t;
 
 struct image_params;
diff --git a/bsd/kern/makekdebugevents.py b/bsd/kern/makekdebugevents.py
new file mode 100755 (executable)
index 0000000..73b2db4
--- /dev/null
@@ -0,0 +1,38 @@
+#!/usr/bin/python
+#
+# This script scans the trace.codes file, containing a mapping of event id to
+# event name for all events, and writes to stdout a C declaration for a table
+# named kd_events[] containing these mappings.
+# Required to generate a header file used by DEVELOPMENT and DEBUG kernels.
+#
+import sys
+import re
+
+# we expect one arg specifying the path to the trace.codes file
+if (len(sys.argv) < 2):
+    exit(1)
+trace_code_file = sys.argv[1]
+
+# regular expression pattern to match <hex_id> <string>
+id_name_pattern = re.compile('0x([0-9a-fA-F]+)\s+([^\s]*)')
+code_table = []
+
+# scan file to generate internal table
+with open(trace_code_file, 'rt') as codes:
+    for line in codes:
+       m = id_name_pattern.match(line)
+       if m:
+            code_table += [(int(m.group(1),base=16), m.group(2))]
+
+# emit typedef:
+print "typedef struct {"
+print "        uint32_t   id;"
+print "        const char *name;"
+print "} kd_event_t;"
+# emit structure declaration and sorted initialization:
+print "kd_event_t kd_events[] = {"
+for mapping in sorted(code_table, key=lambda x: x[0]):
+        print "        {0x%x, \"%s\"}," % mapping
+print "};"
+
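
For reference, the script's output is a plain C table. Fed only the two codes this commit adds to trace.codes, the generated kdebugevents.h fragment would look like the following (abridged; a real run emits every code in the file):

    typedef struct {
            uint32_t   id;
            const char *name;
    } kd_event_t;
    kd_event_t kd_events[] = {
            {0x1a20028, "SFI_GLOBAL_DEFER"},
            {0x5310284, "CPUPM_FI"},
    };
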
index 2af5cc29e2b81423594279252ad318f2b102eae4..a2b82a6e4280951e9bb32fad0a7aec1ef2f59fa0 100644 (file)
@@ -1663,6 +1663,7 @@ fill_vnodeinfo(vnode_t vp, struct vnode_info *vinfo)
                struct stat64 sb;
                int error = 0;
 
+               bzero(&sb, sizeof(struct stat64));
                context = vfs_context_create((vfs_context_t)0);
                error = vn_stat(vp, &sb, NULL, 1, context);
                (void)vfs_context_rele(context);
index c839e868f364ddcf822a0d86e00c403650e62bc9..1247ff35582114d500385bcbfd83b7bd4d05e9b5 100644 (file)
@@ -1487,7 +1487,7 @@ poll_nocancel(struct proc *p, struct poll_nocancel_args *uap, int32_t *retval)
                /* Handle input events */
                if (events & ( POLLIN | POLLRDNORM | POLLPRI | POLLRDBAND | POLLHUP )) {
                        kev.filter = EVFILT_READ;
-                       if (!(events & ( POLLIN | POLLRDNORM )))
+                       if (events & ( POLLPRI | POLLRDBAND ))
                                kev.flags |= EV_OOBAND;
                        kerror = kevent_register(kq, &kev, p);
                }
@@ -1559,7 +1559,7 @@ poll_callback(__unused struct kqueue *kq, struct kevent64_s *kevp, void *data)
        struct poll_continue_args *cont = (struct poll_continue_args *)data;
        struct pollfd *fds = CAST_DOWN(struct pollfd *, kevp->udata);
        short prev_revents = fds->revents;
-       short mask;
+       short mask = 0;
 
        /* convert the results back into revents */
        if (kevp->flags & EV_EOF)
@@ -1572,7 +1572,8 @@ poll_callback(__unused struct kqueue *kq, struct kevent64_s *kevp, void *data)
                if (fds->revents & POLLHUP)
                        mask = (POLLIN | POLLRDNORM | POLLPRI | POLLRDBAND );
                else {
-                       mask = (POLLIN | POLLRDNORM );
+                       if ((kevp->flags & EV_ERROR) == 0 && kevp->data != 0)
+                               mask = (POLLIN | POLLRDNORM );
                        if (kevp->flags & EV_OOBAND)
                                mask |= ( POLLPRI | POLLRDBAND );
                }
index e08f59ff53766cc25fdd7e24330b80778ccd7923..57de6588d46805c05e5499f3fcf3766ed15f10bb 100644 (file)
 0x1a2001c      SFI_WAIT_CANCELED
 0x1a20020      SFI_PID_SET_MANAGED
 0x1a20024      SFI_PID_CLEAR_MANAGED
+0x1a20028      SFI_GLOBAL_DEFER
 0x1a30004      ENERGY_PERF_GPU_DESCRIPTION
 0x1a30008      ENERGY_PERF_GPU_TIME
 0x2010000      L_IP_In_Beg
 0x5310278      CPUPM_PST_UIB
 0x531027C      CPUPM_PST_PLIMIT_UIB
 0x5310280      CPUPM_IO
+0x5310284      CPUPM_FI
 0x5330000      HIBERNATE
 0x5330004      HIBERNATE_WRITE_IMAGE
 0x5330008      HIBERNATE_MACHINE_INIT
index 6b57b3cf8733da668789a871b9f52f41dafc6f3f..2dc33d759b411b7d92e1acb88711bd0e45d82eae 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998-2014 Apple Inc. All rights reserved.
+ * Copyright (c) 1998-2015 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -5392,6 +5392,17 @@ soo_kqfilter(struct fileproc *fp, struct knote *kn, vfs_context_t ctx)
        switch (kn->kn_filter) {
        case EVFILT_READ:
                kn->kn_fop = &soread_filtops;
+               /*
+                * If the caller explicitly asked for OOB results (e.g. poll()),
+                * save that off in the hookid field and reserve the kn_flags
+                * EV_OOBAND bit for output only.
+                */
+               if (kn->kn_flags & EV_OOBAND) {
+                       kn->kn_flags &= ~EV_OOBAND;
+                       kn->kn_hookid = EV_OOBAND;
+               } else {
+                       kn->kn_hookid = 0;
+               }
                skl = &so->so_rcv.sb_sel.si_note;
                break;
        case EVFILT_WRITE:
@@ -5467,44 +5478,42 @@ filt_soread(struct knote *kn, long hint)
        }
 
        /* socket isn't a listener */
-
        kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
+       /*
+        * Clear out EV_OOBAND that filt_soread may have set in the
+        * past.
+        */
+       kn->kn_flags &= ~EV_OOBAND;
 
-       if (so->so_oobmark) {
-               if (kn->kn_flags & EV_OOBAND) {
+       if ((so->so_oobmark) || (so->so_state & SS_RCVATMARK)){
+               kn->kn_flags |= EV_OOBAND;
+               /*
+                * If caller registered explicit interest in OOB data,
+                * return immediately (data == amount beyond mark, for
+                * legacy reasons - that should be changed later).
+                */
+               if (kn->kn_hookid == EV_OOBAND) {
+                       /*
+                        * When so_state is SS_RCVATMARK, so_oobmark
+                        * is 0.
+                        */
                        kn->kn_data -= so->so_oobmark;
                        if ((hint & SO_FILT_HINT_LOCKED) == 0)
                                socket_unlock(so, 1);
                        return (1);
                }
-               kn->kn_data = so->so_oobmark;
-               kn->kn_flags |= EV_OOBAND;
-       } else {
-               if ((so->so_state & SS_CANTRCVMORE)
+       }
+       
+       if ((so->so_state & SS_CANTRCVMORE)
 #if CONTENT_FILTER
-               && cfil_sock_data_pending(&so->so_rcv) == 0
+           && cfil_sock_data_pending(&so->so_rcv) == 0
 #endif /* CONTENT_FILTER */
-               ) {
-                       kn->kn_flags |= EV_EOF;
-                       kn->kn_fflags = so->so_error;
-                       if ((hint & SO_FILT_HINT_LOCKED) == 0)
-                               socket_unlock(so, 1);
-                       return (1);
-               }
-       }
-
-       if (so->so_state & SS_RCVATMARK) {
-               if (kn->kn_flags & EV_OOBAND) {
-                       if ((hint & SO_FILT_HINT_LOCKED) == 0)
-                               socket_unlock(so, 1);
-                       return (1);
-               }
-               kn->kn_flags |= EV_OOBAND;
-       } else if (kn->kn_flags & EV_OOBAND) {
-               kn->kn_data = 0;
+          ) {
+               kn->kn_flags |= EV_EOF;
+               kn->kn_fflags = so->so_error;
                if ((hint & SO_FILT_HINT_LOCKED) == 0)
                        socket_unlock(so, 1);
-               return (0);
+               return (1);
        }
 
        if (so->so_error) {     /* temporary udp error */
@@ -5524,7 +5533,7 @@ filt_soread(struct knote *kn, long hint)
        if ((hint & SO_FILT_HINT_LOCKED) == 0)
                socket_unlock(so, 1);
 
-       return ((kn->kn_flags & EV_OOBAND) || kn->kn_data >= lowwat);
+       return (kn->kn_data >= lowwat);
 }
 
 static void
index 201ad5de36ab3c0506a04b20322e423a0d3dfe7b..c3e668072bc0b61145571253ec603c095881d447 100644 (file)
@@ -1,5 +1,5 @@
 .\"
-.\" Copyright (c) 2008 Apple Inc.  All rights reserved.
+.\" Copyright (c) 2008-2015 Apple Inc.  All rights reserved.
 .\"
 .\" @APPLE_LICENSE_HEADER_START@
 .\" 
@@ -281,6 +281,9 @@ instead of the current state.  Note that some filters may automatically
 set this flag internally.
 .It EV_EOF
 Filters may set this flag to indicate filter-specific EOF condition.
+.It EV_OOBAND
+The read filter on a socket may set this flag to indicate the presence of
+out-of-band data on the descriptor.
 .It EV_ERROR
 See
 .Sx RETURN VALUES
@@ -329,6 +332,12 @@ On return,
 .Va data
 contains the number of bytes of protocol data available to read.
 .Pp
+The presence of EV_OOBAND in
+.Va flags
+indicates the presence of out-of-band data on the socket, with
+.Va data
+equal to the potential number of OOB bytes available to read.
+.Pp
 If the read direction of the socket has shutdown, then the filter
 also sets EV_EOF in
 .Va flags ,
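
A hedged sketch of the usage the new manual text describes: register EVFILT_READ on a connected stream socket and test EV_OOBAND in the returned flags. Error handling is trimmed and sock is assumed to be a connected TCP socket:

    #include <sys/types.h>
    #include <sys/event.h>
    #include <sys/time.h>
    #include <stdio.h>

    int report_readable(int sock)
    {
            struct kevent kev, out;
            int kq = kqueue();

            if (kq < 0)
                    return -1;
            EV_SET(&kev, sock, EVFILT_READ, EV_ADD, 0, 0, NULL);
            if (kevent(kq, &kev, 1, NULL, 0, NULL) < 0)
                    return -1;
            if (kevent(kq, NULL, 0, &out, 1, NULL) == 1)
                    printf("%ld bytes readable%s\n", (long)out.data,
                        (out.flags & EV_OOBAND) ? ", out-of-band data pending" : "");
            return 0;
    }
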
index 0fd816243abcdd5441f328bf4be4228c0ead5014..fe762c21fa36237187ba40beaacba3d90554fe44 100644 (file)
@@ -2401,10 +2401,9 @@ filt_specdetach(struct knote *kn)
        if (ret != KERN_SUCCESS) {
                panic("filt_specdetach(): failed to unlink wait queue link.");
        }
-
+       knote_clearstayqueued(kn);
        (void)wait_queue_link_free(kn->kn_hook);
        kn->kn_hook = NULL;
-       kn->kn_status &= ~KN_STAYQUEUED;
 }
 
 static int 
index 37bea9581365c1a874d034993df8b6cca58459f0..98fff2803da508cb9b4eb7dd7fa9d3c9cf0e4d5f 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2014 Apple Inc. All rights reserved.
+ * Copyright (c) 2004-2015 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -2087,6 +2087,14 @@ bridge_delete_member(struct bridge_softc *sc, struct bridge_iflist *bif,
        BRIDGE_LOCK_ASSERT_HELD(sc);
        VERIFY(ifs != NULL);
 
+       /*
+        * Remove the member from the interface list first so it cannot be
+        * found anymore once we release the bridge lock below
+        */
+       BRIDGE_XLOCK(sc);
+       TAILQ_REMOVE(&sc->sc_iflist, bif, bif_next);
+       BRIDGE_XDROP(sc);
+
        if (!gone) {
                switch (ifs->if_type) {
                case IFT_ETHER:
@@ -2094,8 +2102,15 @@ bridge_delete_member(struct bridge_softc *sc, struct bridge_iflist *bif,
                        /*
                         * Take the interface out of promiscuous mode.
                         */
-                       if (bif->bif_flags & BIFF_PROMISC)
+                       if (bif->bif_flags & BIFF_PROMISC) {
+                               /*
+                                * Unlock to prevent deadlock with bridge_iff_event() in
+                                * case the driver generates an interface event
+                                */
+                               BRIDGE_UNLOCK(sc);
                                (void) ifnet_set_promiscuous(ifs, 0);
+                               BRIDGE_LOCK(sc);
+                       }
                        break;
 
                case IFT_GIF:
@@ -2123,10 +2138,6 @@ bridge_delete_member(struct bridge_softc *sc, struct bridge_iflist *bif,
                bstp_disable(&bif->bif_stp);
 #endif /* BRIDGESTP */
 
-       BRIDGE_XLOCK(sc);
-       TAILQ_REMOVE(&sc->sc_iflist, bif, bif_next);
-       BRIDGE_XDROP(sc);
-
        /*
         * If removing the interface that gave the bridge its mac address, set
         * the mac address of the bridge to the address of the next member, or
index 3ea2612a6ad38bd345b2f0204849f309f955ac6d..e0972f831c9898718c75d95481cd9dd6d0879923 100644 (file)
@@ -85,6 +85,6 @@ typedef __uint32_t n_long;            /* long as received from the net */
 typedef        __uint32_t n_time;              /* ms since 00:00 GMT, byte rev */
 
 #ifdef BSD_KERNEL_PRIVATE
-n_time  iptime(void);
+u_int32_t iptime(void);
 #endif /* BSD_KERNEL_PRIVATE */
 #endif /* _NETINET_IN_SYSTM_H_ */
index ba2869a505cd0a30e15247e0b025fb1cf09fb5ec..256d54b8fc5e02b7ff8d0cd050ab429d8649bc16 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2013 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
  */
 
 struct icmpstat icmpstat;
-SYSCTL_STRUCT(_net_inet_icmp, ICMPCTL_STATS, stats, CTLFLAG_RD | CTLFLAG_LOCKED,
-       &icmpstat, icmpstat, "");
+SYSCTL_STRUCT(_net_inet_icmp, ICMPCTL_STATS, stats,
+    CTLFLAG_RD | CTLFLAG_LOCKED,
+    &icmpstat, icmpstat, "");
 
 static int     icmpmaskrepl = 0;
-SYSCTL_INT(_net_inet_icmp, ICMPCTL_MASKREPL, maskrepl, CTLFLAG_RW | CTLFLAG_LOCKED,
-       &icmpmaskrepl, 0, "");
+SYSCTL_INT(_net_inet_icmp, ICMPCTL_MASKREPL, maskrepl,
+    CTLFLAG_RW | CTLFLAG_LOCKED,
+    &icmpmaskrepl, 0, "");
 
 static int     icmptimestamp = 0;
-SYSCTL_INT(_net_inet_icmp, ICMPCTL_TIMESTAMP, timestamp, CTLFLAG_RW | CTLFLAG_LOCKED,
-       &icmptimestamp, 0, "");
+SYSCTL_INT(_net_inet_icmp, ICMPCTL_TIMESTAMP, timestamp,
+    CTLFLAG_RW | CTLFLAG_LOCKED,
+    &icmptimestamp, 0, "");
 
-static int     drop_redirect = 0;
-SYSCTL_INT(_net_inet_icmp, OID_AUTO, drop_redirect, CTLFLAG_RW | CTLFLAG_LOCKED,
-       &drop_redirect, 0, "");
+static int     drop_redirect = 1;
+SYSCTL_INT(_net_inet_icmp, OID_AUTO, drop_redirect,
+    CTLFLAG_RW | CTLFLAG_LOCKED,
+    &drop_redirect, 0, "");
 
 static int     log_redirect = 0;
-SYSCTL_INT(_net_inet_icmp, OID_AUTO, log_redirect, CTLFLAG_RW | CTLFLAG_LOCKED,
-       &log_redirect, 0, "");
+SYSCTL_INT(_net_inet_icmp, OID_AUTO, log_redirect,
+    CTLFLAG_RW | CTLFLAG_LOCKED,
+    &log_redirect, 0, "");
+
+static int icmp_datalen = 8;
 
 #if ICMP_BANDLIM 
 
@@ -192,19 +199,19 @@ icmp_error(
        struct mbuf *n,
        int type,
        int code,
-       n_long dest,
+       u_int32_t dest,
        u_int32_t nextmtu)
 {
-       struct ip *oip = mtod(n, struct ip *), *nip;
-       unsigned oiplen;
+       struct ip *oip, *nip;
        struct icmp *icp;
        struct mbuf *m;
-       unsigned icmplen;
+       u_int32_t oiphlen, icmplen, icmpelen, nlen;
 
        /* Expect 32-bit aligned data pointer on strict-align platforms */
        MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(n);
 
-       oiplen = IP_VHL_HL(oip->ip_vhl) << 2;
+       oip = mtod(n, struct ip *);
+       oiphlen = IP_VHL_HL(oip->ip_vhl) << 2;
 
 #if ICMPPRINTFS
        if (icmpprintfs)
@@ -218,44 +225,92 @@ icmp_error(
         * Don't error if the old packet protocol was ICMP
         * error message, only known informational types.
         */
-       if (oip->ip_off &(IP_MF|IP_DF))
+       if (oip->ip_off & ~(IP_MF|IP_DF))
                goto freeit;
+
        if (oip->ip_p == IPPROTO_ICMP && type != ICMP_REDIRECT &&
-         n->m_len >= oiplen + ICMP_MINLEN &&
-         !ICMP_INFOTYPE(((struct icmp *)(void *)((caddr_t)oip + oiplen))->
+         n->m_len >= oiphlen + ICMP_MINLEN &&
+         !ICMP_INFOTYPE(((struct icmp *)(void *)((caddr_t)oip + oiphlen))->
          icmp_type)) {
                icmpstat.icps_oldicmp++;
                goto freeit;
        }
-       /* Don't send error in response to a multicast or broadcast packet */
+       /*
+        * Don't send error in response to a multicast or
+        * broadcast packet
+        */
        if (n->m_flags & (M_BCAST|M_MCAST))
                goto freeit;
+
+       /*
+        * Calculate the length to quote from the original packet and prevent
+        * the ICMP mbuf from overflowing.
+        */
+       nlen = m_length(n);
+       if (oip->ip_p == IPPROTO_TCP) {
+               struct tcphdr *th;
+               u_int16_t tcphlen;
+
+               if (oiphlen + sizeof(struct tcphdr) > n->m_len &&
+                   n->m_next == NULL)
+                       goto stdreply;
+               if (n->m_len < (oiphlen + sizeof(struct tcphdr)) &&
+                   (n = m_pullup(n, (oiphlen + sizeof(struct tcphdr)))) == NULL)
+                       goto freeit;
+
+               th = (struct tcphdr *)(void *)((caddr_t)oip + oiphlen);
+               if (th != ((struct tcphdr *)P2ROUNDDOWN(th,
+                   sizeof(u_int32_t))))
+                       goto freeit;
+               tcphlen = th->th_off << 2;
+               if (tcphlen < sizeof(struct tcphdr))
+                       goto freeit;
+               if (oip->ip_len < (oiphlen + tcphlen))
+                       goto freeit;
+               if ((oiphlen + tcphlen) > n->m_len && n->m_next == NULL)
+                       goto stdreply;
+               if (n->m_len < (oiphlen + tcphlen) &&
+                   (n = m_pullup(n, (oiphlen + tcphlen))) == NULL)
+                       goto freeit;
+
+               icmpelen = max(tcphlen, min(icmp_datalen,
+                   (oip->ip_len - oiphlen)));
+       } else
+stdreply:      icmpelen = max(ICMP_MINLEN, min(icmp_datalen,
+                   (ntohs(oip->ip_len) - oiphlen)));
+
+       icmplen = min(oiphlen + icmpelen, min(nlen, oip->ip_len));
+       if (icmplen < sizeof(struct ip))
+               goto freeit;
        /*
         * First, formulate icmp message
         */
-       m = m_gethdr(M_DONTWAIT, MT_HEADER);    /* MAC-OK */
+       if (MHLEN > (sizeof(struct ip) + ICMP_MINLEN + icmplen))
+               m = m_gethdr(M_DONTWAIT, MT_HEADER);    /* MAC-OK */
+       else 
+               m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
+
        if (m == NULL)
                goto freeit;
 
-        if (n->m_flags & M_SKIP_FIREWALL) {
-               /* set M_SKIP_FIREWALL to skip firewall check, since we're called from firewall */
+       if (n->m_flags & M_SKIP_FIREWALL) {
+               /*
+                * set M_SKIP_FIREWALL to skip firewall check, since
+                * we're called from firewall
+                */
                m->m_flags |= M_SKIP_FIREWALL;
        }
 
 #if CONFIG_MACF_NET
        mac_mbuf_label_associate_netlayer(n, m);
 #endif
-       icmplen = min(oiplen + 8, oip->ip_len);
-       if (icmplen < sizeof(struct ip)) {
-               printf("icmp_error: bad length\n");
-               m_free(m);
-               goto freeit;
-       }
-       m->m_len = icmplen + ICMP_MINLEN;
+       m->m_len = icmplen + ICMP_MINLEN; /* for ICMP header and data */
        MH_ALIGN(m, m->m_len);
        icp = mtod(m, struct icmp *);
-       if ((u_int)type > ICMP_MAXTYPE)
-               panic("icmp_error");
+       if ((u_int)type > ICMP_MAXTYPE) {
+               m_freem(m);
+               goto freeit;
+       }
        icmpstat.icps_outhist[type]++;
        icp->icmp_type = type;
        if (type == ICMP_REDIRECT)
@@ -290,8 +345,10 @@ icmp_error(
         * Now, copy old ip header (without options)
         * in front of icmp message.
         */
-       if (m->m_data - sizeof(struct ip) < m->m_pktdat)
-               panic("icmp len");
+       if (m->m_data - sizeof(struct ip) < m->m_pktdat) {
+               m_freem(m);
+               goto freeit;
+       }
        m->m_data -= sizeof(struct ip);
        m->m_len += sizeof(struct ip);
        m->m_pkthdr.len = m->m_len;
@@ -302,6 +359,7 @@ icmp_error(
        nip->ip_vhl = IP_VHL_BORING;
        nip->ip_p = IPPROTO_ICMP;
        nip->ip_tos = 0;
+       nip->ip_off = 0;
        icmp_reflect(m);
 
 freeit:
@@ -856,7 +914,7 @@ icmp_send(struct mbuf *m, struct mbuf *opts)
        ROUTE_RELEASE(&ro);
 }
 
-n_time
+u_int32_t
 iptime(void)
 {
        struct timeval atv;
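
A worked example (assumed header sizes, not from the patch) of the quote-length computation icmp_error() now performs: with a 20-byte IP header, a 20-byte TCP header and icmp_datalen = 8, the error quotes the full 40 bytes of IP plus TCP header rather than a fixed 8 data bytes:

    #include <stdio.h>

    static unsigned umin(unsigned a, unsigned b) { return a < b ? a : b; }
    static unsigned umax(unsigned a, unsigned b) { return a > b ? a : b; }

    int main(void)
    {
            unsigned icmp_datalen = 8;      /* as in the patch */
            unsigned oiphlen = 20;          /* assumed original IP header length */
            unsigned tcphlen = 20;          /* assumed TCP header length */
            unsigned ip_len = 1500;         /* assumed original datagram length */
            unsigned nlen = 1500;           /* assumed mbuf chain length */

            unsigned icmpelen = umax(tcphlen, umin(icmp_datalen, ip_len - oiphlen));
            unsigned icmplen  = umin(oiphlen + icmpelen, umin(nlen, ip_len));
            printf("icmpelen=%u icmplen=%u\n", icmpelen, icmplen);   /* 20 and 40 */
            return 0;
    }
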
index 9dde8f80e93f811a2328efa980453d7e6a0a0a7e..bc1bb0f2f0f64846306fcf763e3c625a38d74da7 100644 (file)
@@ -2041,7 +2041,7 @@ ip_dooptions(struct mbuf *m, int pass, struct sockaddr_in *next_hop)
        struct in_ifaddr *ia;
        int opt, optlen, cnt, off, code, type = ICMP_PARAMPROB, forward = 0;
        struct in_addr *sin, dst;
-       n_time ntime;
+       u_int32_t ntime;
        struct sockaddr_in ipaddr = {
            sizeof (ipaddr), AF_INET, 0, { 0 }, { 0, } };
 
@@ -2305,8 +2305,6 @@ nosourcerouting:
        }
        return (0);
 bad:
-       /* XXX icmp_error adds in hdr length */
-       ip->ip_len -= IP_VHL_HL(ip->ip_vhl) << 2;
        icmp_error(m, type, code, 0, 0);
        OSAddAtomic(1, &ipstat.ips_badoptions);
        return (1);
index 7e2d00b07e4839aaeb72522d9695fc2e89f2c901..2eb86f1a91e8645e4f21ff5ac4f5fad74b26bfa1 100644 (file)
@@ -161,8 +161,10 @@ tcp_cubic_update(struct tcpcb *tp, u_int32_t rtt)
        float K, var;
        u_int32_t elapsed_time, win;
 
-       VERIFY(tp->t_ccstate->cub_last_max > 0);
        win = min(tp->snd_cwnd, tp->snd_wnd);
+       if (tp->t_ccstate->cub_last_max == 0)
+               tp->t_ccstate->cub_last_max = tp->snd_ssthresh;
+
        if (tp->t_ccstate->cub_epoch_start == 0) {
                /*
                 * This is the beginning of a new epoch, initialize some of
index 0cfb8d9533c6bb42fd8a32f5e6251dfeac224023..d7a7130a518139f432ed09f330f65f167e0b8c5d 100644 (file)
@@ -67,7 +67,7 @@
 #ifdef PRIVATE
 
 struct tcp_debug {
-       n_time  td_time;
+       u_int32_t td_time;
        short   td_act;
        short   td_ostate;
        caddr_t td_tcb;
index ee974e44ec5973c24c3766c9f2bd04688d04140d..53e362c89e16b1d5f86f5cce4fc2863054464bb3 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2008-2013 Apple Inc. All rights reserved.
+ * Copyright (c) 2008-2015 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -476,7 +476,7 @@ u_int32_t   rip6_sendspace = RIPV6SNDQ;
 u_int32_t      rip6_recvspace = RIPV6RCVQ;
 
 /* ICMPV6 parameters */
-int    icmp6_rediraccept = 1;          /* accept and process redirects */
+int    icmp6_rediraccept = 0;          /* accept and process redirects */
 int    icmp6_redirtimeout = 10 * 60;   /* 10 minutes */
 int    icmp6errppslim = 500;           /* 500 packets per second */
 int    icmp6rappslim = 10;             /* 10 packets per second */
index 266547020ffb7c39857b752369126a1a50f3b496..4cc199ca8bc0b53ee9ac2c6225b599d2199a3ec7 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2003-2014 Apple Inc. All rights reserved.
+ * Copyright (c) 2003-2015 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -605,27 +605,15 @@ ip6_input(struct mbuf *m)
        }
 
        ip6stat.ip6s_nxthist[ip6->ip6_nxt]++;
-
-#if IPFW2
-       /*
-        * Check with the firewall...
-        */
-       if (ip6_fw_enable && ip6_fw_chk_ptr) {
-               u_short port = 0;
-               /* If ipfw says divert, we have to just drop packet */
-               /* use port as a dummy argument */
-               if ((*ip6_fw_chk_ptr)(&ip6, NULL, &port, &m)) {
-                       m_freem(m);
-                       m = NULL;
-               }
-               if (!m)
-                       goto done;
-       }
-#endif /* IPFW2 */
-
        /*
         * Check against address spoofing/corruption.
         */
+       if (!(m->m_pkthdr.pkt_flags & PKTF_LOOP) &&
+           IN6_IS_ADDR_LOOPBACK(&ip6->ip6_src)) {
+               ip6stat.ip6s_badscope++;
+               in6_ifstat_inc(inifp, ifs6_in_addrerr);
+               goto bad;
+       }
        if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src) ||
            IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_dst)) {
                /*
@@ -681,6 +669,22 @@ ip6_input(struct mbuf *m)
                goto bad;
        }
 #endif
+#if IPFW2
+        /*
+         * Check with the firewall...
+         */
+        if (ip6_fw_enable && ip6_fw_chk_ptr) {
+                u_short port = 0;
+                /* If ipfw says divert, we have to just drop packet */
+                /* use port as a dummy argument */
+                if ((*ip6_fw_chk_ptr)(&ip6, NULL, &port, &m)) {
+                        m_freem(m);
+                        m = NULL;
+                }
+                if (!m)
+                        goto done;
+        }
+#endif /* IPFW2 */
 
        /*
         * Naively assume we can attribute inbound data to the route we would
index a58e5d866ab4899add3726b19fa39f0b408e2420..c0715a2b0267b0e962183eeffc569ceea733cda3 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -3797,7 +3797,7 @@ again:
                }
        }
 
-       if (req->r_achain.tqe_next == NFSREQNOLIST)
+       if (req->r_achain.tqe_next == NFSREQNOLIST || req->r_achain.tqe_next == NFSIODCOMPLETING)
                TAILQ_INSERT_TAIL(&nmp->nm_iodq, req, r_achain);
 
        /* If this mount doesn't already have an nfsiod working on it... */
index ad7d5a2714ea3e5da2b95cbd8b0de4ae5f5a066d..49d487f53d438338a5131fdb13fc60447392758a 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -4302,8 +4302,7 @@ nfs_mount_zombie(struct nfsmount *nmp, int nm_state_flags)
                nfs4_mount_callback_shutdown(nmp);
 
        /* Destroy any RPCSEC_GSS contexts */
-       if (!TAILQ_EMPTY(&nmp->nm_gsscl))
-               nfs_gss_clnt_ctx_unmount(nmp);
+       nfs_gss_clnt_ctx_unmount(nmp);
 
        /* mark the socket for termination */
        lck_mtx_lock(&nmp->nm_lock);
index 880af7e3d08f4123bc3a7077055c47e7f0b59e16..30a5166b8993667706adaad2cd9b65119f3290d1 100644 (file)
@@ -135,7 +135,7 @@ EXPORT_MI_LIST      = ${KERNELFILES} ${PRIVATE_KERNELFILES} linker_set.h bsdtask_info
                                                                vnode_internal.h proc_internal.h file_internal.h mount_internal.h \
                                                                uio_internal.h tree.h munge.h kern_tests.h
 
-EXPORT_MI_GEN_LIST = syscall.h sysproto.h
+EXPORT_MI_GEN_LIST = syscall.h sysproto.h kdebugevents.h
 
 EXPORT_MI_DIR = sys
 
@@ -150,9 +150,10 @@ INSTALL_KF_MI_LCL_GEN_LIST = sysproto.h
 # /System/Library/Frameworks/Kernel.framework/Headers
 INSTALL_KF_MI_LIST = ${KERNELFILES}
 
-INSTALL_KF_MI_GEN_LIST = 
+INSTALL_KF_MI_GEN_LIST =
 
 MAKESYSCALLS = $(SRCROOT)/bsd/kern/makesyscalls.sh
+MAKEKDEBUGEVENTS = $(SRCROOT)/bsd/kern/makekdebugevents.py
 
 $(OBJROOT)/cscope.genhdrs:
        $(_v)mkdir -p $(OBJROOT)/cscope.genhdrs
@@ -167,6 +168,11 @@ sysproto.h: $(SRCROOT)/bsd/kern/syscalls.master $(MAKESYSCALLS) $(OBJROOT)/cscop
        @echo "$(OBJPATH)/bsd/sys/$@" > $(OBJROOT)/cscope.genhdrs/$@.path
        $(_v)$(MAKESYSCALLS) $< proto > /dev/null
 
+kdebugevents.h:  $(SRCROOT)/bsd/kern/trace.codes $(MAKEKDEBUGEVENTS) $(OBJROOT)/cscope.genhdrs
+       @echo "Generating bsd/kern/$@ from $<";
+       @echo "$(OBJPATH)/bsd/kern/$@" > $(OBJROOT)/cscope.genhdrs/$@.path
+       $(_v)$(MAKEKDEBUGEVENTS) $< > "$(OBJPATH)/bsd/sys/$@"
+
 MAKE_POSIX_AVAILABILITY = $(SRCROOT)/bsd/sys/make_posix_availability.sh
 _posix_availability.h: $(MAKE_POSIX_AVAILABILITY)
        @echo "Generating bsd/sys/$@"
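
The new rule generates kdebugevents.h from bsd/kern/trace.codes at build time via the added makekdebugevents.py script. The generated header itself is not part of this diff; purely as a hypothetical illustration, a header of this kind could map kdebug event codes to printable names along these lines (the structure, names, and values below are assumptions, not the script's real output):

    /* Hypothetical sketch of a code-to-name table such a header might hold. */
    #include <stdint.h>

    struct kd_event_name {
            uint32_t    code;   /* kdebug debugid taken from trace.codes */
            const char *name;   /* symbolic event name */
    };

    static const struct kd_event_name kd_event_names[] = {
            { 0x01020304, "EXAMPLE_event" },    /* values purely illustrative */
    };
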
index fa41389d41657b83ca06039ebc72f38c188b482f..3e39fca6a4be23023d72ba0f48d64d1db1746795 100644 (file)
@@ -26,6 +26,8 @@
 /*
  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ *
+ * Portions Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_DTRACE_H
@@ -103,6 +105,7 @@ extern "C" {
 
 #define S_ROUND(x, a)   ((x) + (((a) ? (a) : 1) - 1) & ~(((a) ? (a) : 1) - 1))
 #define P2ROUNDUP(x, align)             (-(-(x) & -(align)))
+#define        P2PHASEUP(x, align, phase)      ((phase) - (((phase) - (x)) & -(align)))
 
 #define        CTF_MODEL_ILP32 1       /* object data model is ILP32 */
 #define        CTF_MODEL_LP64  2       /* object data model is LP64 */
@@ -1046,10 +1049,10 @@ typedef struct dtrace_ecbdesc {
  * DTrace Metadata Description Structures
  *
  * DTrace separates the trace data stream from the metadata stream.  The only
- * metadata tokens placed in the data stream are enabled probe identifiers
- * (EPIDs) or (in the case of aggregations) aggregation identifiers.  In order
- * to determine the structure of the data, DTrace consumers pass the token to
- * the kernel, and receive in return a corresponding description of the enabled
+ * metadata tokens placed in the data stream are the dtrace_rechdr_t (EPID +
+ * timestamp) or (in the case of aggregations) aggregation identifiers.  To
+ * determine the structure of the data, DTrace consumers pass the token to the
+ * kernel, and receive in return a corresponding description of the enabled
  * probe (via the dtrace_eprobedesc structure) or the aggregation (via the
  * dtrace_aggdesc structure).  Both of these structures are expressed in terms
  * of record descriptions (via the dtrace_recdesc structure) that describe the
@@ -1147,11 +1150,12 @@ typedef struct dtrace_fmtdesc {
 #define        DTRACEOPT_AGGHIST       27      /* histogram aggregation output */
 #define        DTRACEOPT_AGGPACK       28      /* packed aggregation output */
 #define        DTRACEOPT_AGGZOOM       29      /* zoomed aggregation scaling */
+#define        DTRACEOPT_TEMPORAL      30      /* temporally ordered output */
 #if !defined(__APPLE__)
-#define DTRACEOPT_MAX           30      /* number of options */
-#else
-#define DTRACEOPT_STACKSYMBOLS  30      /* clear to prevent stack symbolication */
 #define DTRACEOPT_MAX           31      /* number of options */
+#else
+#define DTRACEOPT_STACKSYMBOLS  31      /* clear to prevent stack symbolication */
+#define DTRACEOPT_MAX           32      /* number of options */
 #endif /* __APPLE__ */
 
 #define        DTRACEOPT_UNSET         (dtrace_optval_t)-2     /* unset option */
@@ -1172,7 +1176,9 @@ typedef struct dtrace_fmtdesc {
  * where user-level wishes the kernel to snapshot the buffer to (the
  * dtbd_data field).  The kernel uses the same structure to pass back some
  * information regarding the buffer:  the size of data actually copied out, the
- * number of drops, the number of errors, and the offset of the oldest record.
+ * number of drops, the number of errors, the offset of the oldest record,
+ * and the time of the snapshot.
+ *
  * If the buffer policy is a "switch" policy, taking a snapshot of the
  * principal buffer has the additional effect of switching the active and
  * inactive buffers.  Taking a snapshot of the aggregation buffer _always_ has
@@ -1185,8 +1191,29 @@ typedef struct dtrace_bufdesc {
         uint64_t dtbd_drops;                    /* number of drops */
         DTRACE_PTR(char, dtbd_data);            /* data */
         uint64_t dtbd_oldest;                   /* offset of oldest record */
+       uint64_t dtbd_timestamp;                /* hrtime of snapshot */
 } dtrace_bufdesc_t;
 
+/*
+ * Each record in the buffer (dtbd_data) begins with a header that includes
+ * the epid and a timestamp.  The timestamp is split into two 4-byte parts
+ * so that we do not require 8-byte alignment.
+ */
+typedef struct dtrace_rechdr {
+       dtrace_epid_t dtrh_epid;                /* enabled probe id */
+       uint32_t dtrh_timestamp_hi;             /* high bits of hrtime_t */
+       uint32_t dtrh_timestamp_lo;             /* low bits of hrtime_t */
+} dtrace_rechdr_t;
+
+#define        DTRACE_RECORD_LOAD_TIMESTAMP(dtrh)                      \
+       ((dtrh)->dtrh_timestamp_lo +                            \
+       ((uint64_t)(dtrh)->dtrh_timestamp_hi << 32))
+
+#define        DTRACE_RECORD_STORE_TIMESTAMP(dtrh, hrtime) {           \
+       (dtrh)->dtrh_timestamp_lo = (uint32_t)hrtime;           \
+       (dtrh)->dtrh_timestamp_hi = hrtime >> 32;               \
+}
+
 /*
  * DTrace Status
  *
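
With these additions, each record in a principal-buffer snapshot begins with a dtrace_rechdr_t rather than a bare EPID, and dtbd_timestamp records when the snapshot was taken, which is what makes temporally ordered (DTRACEOPT_TEMPORAL) output possible. A minimal consumer-side sketch of walking such a snapshot follows; the record-size lookup is a placeholder (real consumers size records from the probe description for each EPID), so treat it as an assumption:

    #include <stddef.h>
    #include <stdint.h>

    typedef uint32_t dtrace_epid_t;

    typedef struct dtrace_rechdr {           /* mirrors the struct added above */
            dtrace_epid_t dtrh_epid;
            uint32_t dtrh_timestamp_hi;
            uint32_t dtrh_timestamp_lo;
    } dtrace_rechdr_t;

    #define DTRACE_RECORD_LOAD_TIMESTAMP(dtrh)              \
            ((dtrh)->dtrh_timestamp_lo +                    \
            ((uint64_t)(dtrh)->dtrh_timestamp_hi << 32))

    /* Placeholder: real consumers derive this from dtrace_eprobedesc. */
    static size_t
    record_size_for_epid(dtrace_epid_t epid)
    {
            (void)epid;
            return sizeof (dtrace_rechdr_t);
    }

    static void
    walk_snapshot(const char *buf, size_t len)
    {
            size_t off = 0;
            while (off + sizeof (dtrace_rechdr_t) <= len) {
                    const dtrace_rechdr_t *rh = (const dtrace_rechdr_t *)(buf + off);
                    uint64_t ts = DTRACE_RECORD_LOAD_TIMESTAMP(rh);
                    (void)ts;        /* records can be merged across CPUs by ts */
                    off += record_size_for_epid(rh->dtrh_epid);
            }
    }
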
index 71dc020f2985b08c34a58cbea9f8e86a9cc287d2..cbb14c0abd651fcade830c0dbb5fff44da1acf20 100644 (file)
@@ -22,6 +22,8 @@
 /*
  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ *
+ * Portions Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_DTRACE_IMPL_H
@@ -202,15 +204,18 @@ typedef struct dtrace_hash {
  * predicate is non-NULL, the DIF object is executed.  If the result is
  * non-zero, the action list is processed, with each action being executed
  * accordingly.  When the action list has been completely executed, processing
- * advances to the next ECB.  processing advances to the next ECB.  If the
- * result is non-zero; For each ECB, it first determines the The ECB
- * abstraction allows disjoint consumers to multiplex on single probes.
+ * advances to the next ECB. The ECB abstraction allows disjoint consumers
+ * to multiplex on single probes.
+ *
+ * Execution of the ECB results in consuming dte_size bytes in the buffer
+ * to record data.  During execution, dte_needed bytes must be available in
+ * the buffer.  This space is used for both recorded data and tuple data.
  */
 struct dtrace_ecb {
        dtrace_epid_t dte_epid;                 /* enabled probe ID */
        uint32_t dte_alignment;                 /* required alignment */
-       size_t dte_needed;                      /* bytes needed */
-       size_t dte_size;                        /* total size of payload */
+       size_t dte_needed;                      /* space needed for execution */
+       size_t dte_size;                        /* size of recorded payload */
        dtrace_predicate_t *dte_predicate;      /* predicate, if any */
        dtrace_action_t *dte_action;            /* actions, if any */
        dtrace_ecb_t *dte_next;                 /* next ECB on probe */
@@ -268,27 +273,30 @@ typedef struct dtrace_aggregation {
  * the EPID, the consumer can determine the data layout.  (The data buffer
  * layout is shown schematically below.)  By assuring that one can determine
  * data layout from the EPID, the metadata stream can be separated from the
- * data stream -- simplifying the data stream enormously.
- *
- *      base of data buffer --->  +------+--------------------+------+
- *                                | EPID | data               | EPID |
- *                                +------+--------+------+----+------+
- *                                | data          | EPID | data      |
- *                                +---------------+------+-----------+
- *                                | data, cont.                      |
- *                                +------+--------------------+------+
- *                                | EPID | data               |      |
- *                                +------+--------------------+      |
- *                                |                ||                |
- *                                |                ||                |
- *                                |                \/                |
- *                                :                                  :
- *                                .                                  .
- *                                .                                  .
- *                                .                                  .
- *                                :                                  :
- *                                |                                  |
- *     limit of data buffer --->  +----------------------------------+
+ * data stream -- simplifying the data stream enormously.  The ECB always
+ * precedes the recorded data as part of the dtrace_rechdr_t structure that
+ * includes the EPID and a high-resolution timestamp used for output ordering
+ * consistency.
+ *
+ *      base of data buffer --->  +--------+--------------------+--------+
+ *                                | rechdr | data               | rechdr |
+ *                                +--------+------+--------+----+--------+
+ *                                | data          | rechdr | data        |
+ *                                +---------------+--------+-------------+
+ *                                | data, cont.                          |
+ *                                +--------+--------------------+--------+
+ *                                | rechdr | data               |        |
+ *                                +--------+--------------------+        |
+ *                                |                ||                    |
+ *                                |                ||                    |
+ *                                |                \/                    |
+ *                                :                                      :
+ *                                .                                      .
+ *                                .                                      .
+ *                                .                                      .
+ *                                :                                      :
+ *                                |                                      |
+ *     limit of data buffer --->  +--------------------------------------+
  *
  * When evaluating an ECB, dtrace_probe() determines if the ECB's needs of the
  * principal buffer (both scratch and payload) exceed the available space.  If
@@ -426,6 +434,8 @@ typedef struct dtrace_buffer {
 #ifndef _LP64
        uint64_t dtb_pad1;
 #endif
+       uint64_t dtb_switched;                  /* time of last switch */
+       uint64_t dtb_interval;                  /* observed switch interval */
 } dtrace_buffer_t;
 
 /*
index 66efc61b012cce808b79076cbfd4d1eaf0fd4db9..44cef5438af4fb00afefeb29a56aabd303fd1b32 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2003-2012 Apple Inc. All rights reserved.
+ * Copyright (c) 2003-2015 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -177,14 +177,20 @@ struct kevent64_s {
  * (which always returns true for regular files - regardless of the amount
  * of unread data in the file).
  *
- * On input, EV_OOBAND specifies that only OOB data should be looked for.
- * The returned data count is the number of bytes beyond the current OOB marker.
+ * On input, EV_OOBAND specifies that the filter should actively return in the
+ * presence of OOB data on the descriptor. It implies that the filter will return
+ * if there is OOB data available to read OR when any other condition
+ * for the read is met (for example, the number of bytes of regular data becomes >=
+ * low-watermark).
+ * If EV_OOBAND is not set on input, the filter should not actively return
+ * for out-of-band data on the descriptor. The filter will then only return
+ * when some other condition for read is met (e.g. the number of regular data bytes
+ * is >= low-watermark OR the socket can't receive more data (SS_CANTRCVMORE)).
  *
- * On output, EV_OOBAND indicates that OOB data is present
+ * On output, EV_OOBAND indicates the presence of OOB data on the descriptor.
  * If it was not specified as an input parameter, then the data count is the
- * number of bytes before the current OOB marker. If at the marker, the
- * data count indicates the number of bytes available after it.  In either
- * case, it's the amount of data one could expect to receive next.
+ * number of bytes before the current OOB marker; otherwise, the data count is
+ * the number of bytes beyond the OOB marker.
  */
 #define EV_POLL        EV_FLAG0
 #define EV_OOBAND      EV_FLAG1
@@ -474,7 +480,7 @@ extern int  knote_link_wait_queue(struct knote *kn, struct wait_queue *wq, wait_q
 extern int     knote_unlink_wait_queue(struct knote *kn, struct wait_queue *wq, wait_queue_link_t *wqlp);
 extern void    knote_fdclose(struct proc *p, int fd);
 extern void    knote_markstayqueued(struct knote *kn);
-
+extern void    knote_clearstayqueued(struct knote *kn);
 #endif /* !KERNEL_PRIVATE */
 
 #else  /* KERNEL */
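
For reference, the clarified EV_OOBAND semantics above surface to user space through the kevent interfaces. A minimal sketch of a caller that wants to be woken for either regular or out-of-band socket data might look like this (socket setup and error handling are elided; the snippet is illustrative, not taken from this commit):

    #include <sys/event.h>
    #include <sys/time.h>
    #include <unistd.h>

    /* Returns 1 if the wakeup reported OOB data, 0 otherwise, -1 on error. */
    static int
    wait_for_data_or_oob(int sock)
    {
            struct kevent64_s kev;
            int kq = kqueue();

            if (kq == -1)
                    return -1;

            /* EV_OOBAND on input: also return when OOB data is available. */
            EV_SET64(&kev, sock, EVFILT_READ, EV_ADD | EV_OOBAND, 0, 0, 0, 0, 0);
            if (kevent64(kq, &kev, 1, NULL, 0, 0, NULL) == -1) {
                    close(kq);
                    return -1;
            }

            int n = kevent64(kq, NULL, 0, &kev, 1, 0, NULL);
            close(kq);

            /* EV_OOBAND on output reports the presence of OOB data. */
            return (n == 1) ? ((kev.flags & EV_OOBAND) != 0) : -1;
    }
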
index 186ace8f3818a4f23332dd92e690949828f71c45..af75e23a1033a424531bd244e3692e95d98757a2 100644 (file)
@@ -291,15 +291,15 @@ extern int kdebug_trace(uint32_t code, uint64_t arg1, uint64_t arg2, uint64_t ar
 /* Codes for Selective Forced Idle (DBG_MACH_SFI) */
 #define SFI_SET_WINDOW                 0x0
 #define SFI_CANCEL_WINDOW              0x1
-#define SFI_SET_CLASS_OFFTIME  0x2
+#define SFI_SET_CLASS_OFFTIME          0x2
 #define SFI_CANCEL_CLASS_OFFTIME       0x3
 #define SFI_THREAD_DEFER               0x4
 #define SFI_OFF_TIMER                  0x5
 #define SFI_ON_TIMER                   0x6
 #define SFI_WAIT_CANCELED              0x7
 #define SFI_PID_SET_MANAGED            0x8
-#define SFI_PID_CLEAR_MANAGED  0x9
-
+#define SFI_PID_CLEAR_MANAGED          0x9
+#define SFI_GLOBAL_DEFER               0xa
 /* **** The Kernel Debug Sub Classes for Network (DBG_NETWORK) **** */
 #define DBG_NETIP      1       /* Internet Protocol */
 #define DBG_NETARP     2       /* Address Resolution Protocol */
@@ -462,11 +462,17 @@ extern int kdebug_trace(uint32_t code, uint64_t arg1, uint64_t arg2, uint64_t ar
 #define DBG_TRACE_STRING    1
 #define        DBG_TRACE_INFO      2
 
-/*
- * TRACE_DATA_NEWTHREAD                        0x1
- * TRACE_DATA_EXEC                     0x2
- */
-#define TRACE_DATA_THREAD_TERMINATE    0x3     /* thread has been queued for deallocation and can no longer run */
+/* The Kernel Debug events: */
+#define        TRACE_DATA_NEWTHREAD            (TRACEDBG_CODE(DBG_TRACE_DATA, 1))
+#define        TRACE_DATA_EXEC                 (TRACEDBG_CODE(DBG_TRACE_DATA, 2))
+#define        TRACE_DATA_THREAD_TERMINATE     (TRACEDBG_CODE(DBG_TRACE_DATA, 3))
+#define        TRACE_STRING_NEWTHREAD          (TRACEDBG_CODE(DBG_TRACE_STRING, 1))
+#define        TRACE_STRING_EXEC               (TRACEDBG_CODE(DBG_TRACE_STRING, 2))
+#define        TRACE_PANIC                     (TRACEDBG_CODE(DBG_TRACE_INFO, 0))
+#define        TRACE_TIMESTAMPS                (TRACEDBG_CODE(DBG_TRACE_INFO, 1))
+#define        TRACE_LOST_EVENTS               (TRACEDBG_CODE(DBG_TRACE_INFO, 2))
+#define        TRACE_WRITING_EVENTS            (TRACEDBG_CODE(DBG_TRACE_INFO, 3))
+#define        TRACE_INFO_STRING               (TRACEDBG_CODE(DBG_TRACE_INFO, 4))
 
 /* The Kernel Debug Sub Classes for DBG_CORESTORAGE */
 #define DBG_CS_IO      0
@@ -638,6 +644,7 @@ extern unsigned int kdebug_enable;
 #define KDEBUG_ENABLE_ENTROPY 0x2              /* Obsolescent */
 #define KDEBUG_ENABLE_CHUD    0x4
 #define KDEBUG_ENABLE_PPT     0x8
+#define KDEBUG_ENABLE_SERIAL 0x10
 
 /*
  * Infer the supported kernel debug event level from config option.
@@ -1053,6 +1060,14 @@ typedef struct {
 /* Minimum value allowed when setting decrementer ticks */
 #define KDBG_MINRTCDEC  2500
 
+/* VFS lookup events for serial traces */
+#define VFS_LOOKUP     (FSDBG_CODE(DBG_FSRW,36))
+#define VFS_LOOKUP_DONE        (FSDBG_CODE(DBG_FSRW,39))
+
+#if (DEVELOPMENT || DEBUG)
+#define KDEBUG_MOJO_TRACE 1
+#endif
+
 #endif /* __APPLE_API_PRIVATE */
 #endif /* PRIVATE */
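
The TRACE_* events above are now composed with TRACEDBG_CODE() instead of being written as raw constants. Assuming the usual kdebug debugid packing (class in the top byte, subclass in the next byte, the code shifted left by two, and the low two bits left for DBG_FUNC_START/DBG_FUNC_END), the composition works out as in this small sketch; the numeric values of DBG_TRACE and DBG_TRACE_DATA and the packing macro are restated here as assumptions rather than quoted from the diff:

    #include <stdint.h>
    #include <stdio.h>

    /* Assumed packing, mirroring KDBG_CODE(Class, SubClass, code). */
    #define KDBG_CODE(c, sc, code) \
            ((((c) & 0xff) << 24) | (((sc) & 0xff) << 16) | (((code) & 0x3fff) << 2))
    #define DBG_TRACE       7        /* assumed class value for trace events */
    #define DBG_TRACE_DATA  0        /* assumed subclass value */
    #define DBG_FUNC_START  1

    int
    main(void)
    {
            /* TRACE_DATA_NEWTHREAD == TRACEDBG_CODE(DBG_TRACE_DATA, 1) */
            uint32_t code = KDBG_CODE(DBG_TRACE, DBG_TRACE_DATA, 1);
            printf("0x%08x\n", code | DBG_FUNC_START);   /* prints 0x07000005 */
            return 0;
    }
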
 
index fd363613211bb670bb456ddee59785554f7f3e7e..6a90031c6798426385818b020a232d2eaf75d9e5 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2008 Apple Inc. All rights reserved.
+ * Copyright (c) 2004-2014 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -643,6 +643,7 @@ add_fsevent(int type, vfs_context_t ctx, ...)
                VATTR_WANTED(&va, va_mode);
                VATTR_WANTED(&va, va_uid);
                VATTR_WANTED(&va, va_gid);
+               VATTR_WANTED(&va, va_nlink);
                if ((ret = vnode_getattr(vp, &va, vfs_context_kernel())) != 0) {
                    // printf("add_fsevent: failed to getattr on vp %p (%d)\n", cur->fref.vp, ret);
                    cur->str = NULL;
@@ -655,6 +656,12 @@ add_fsevent(int type, vfs_context_t ctx, ...)
                cur->mode = (int32_t)vnode_vttoif(vnode_vtype(vp)) | va.va_mode;
                cur->uid  = va.va_uid;
                cur->gid  = va.va_gid;
+               if (vp->v_flag & VISHARDLINK) {
+                       cur->mode |= FSE_MODE_HLINK;
+                       if ((vp->v_type == VDIR && va.va_dirlinkcount == 0) || (vp->v_type == VREG && va.va_nlink == 0)) {
+                               cur->mode |= FSE_MODE_LAST_HLINK;
+                       }
+               }
 
                // if we haven't gotten the path yet, get it.
                if (pathbuff == NULL) {
index 579542e1652dd6300d929a1f99855e5de150c316..4beff12a6e9c4364e61e38158991e94e3b74d066 100644 (file)
@@ -1679,9 +1679,9 @@ kdebug_lookup_gen_events(long *dbg_parms, int dbg_namelen, void *dp, boolean_t l
         * entries, we must mark the start of the path's string and the end.
         */
        if (lookup == TRUE)
-               code = (FSDBG_CODE(DBG_FSRW,36)) | DBG_FUNC_START;
+               code = VFS_LOOKUP | DBG_FUNC_START;
        else
-               code = (FSDBG_CODE(DBG_FSRW,39)) | DBG_FUNC_START;
+               code = VFS_LOOKUP_DONE | DBG_FUNC_START;
 
        if (dbg_namelen <= (int)(3 * sizeof(long)))
                code |= DBG_FUNC_END;
index 521cb571366c8f1cc3490b021a40fe5666064802..ba37a4e38abddf71ed2b369c4e70e114928dc11e 100644 (file)
@@ -9235,6 +9235,24 @@ fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long
        is64bit = proc_is64bit(p);
 
        memp = NULL;
+
+       /*
+        * ensure the buffer is large enough for underlying calls
+        */
+#ifndef HFSIOC_GETPATH
+typedef char pn_t[MAXPATHLEN];
+#define HFSIOC_GETPATH  _IOWR('h', 13, pn_t)
+#endif
+
+#ifndef HFS_GETPATH
+#define HFS_GETPATH  IOCBASECMD(HFSIOC_GETPATH)
+#endif
+       if (IOCBASECMD(cmd) == HFS_GETPATH) {
+               /* Round up to MAXPATHLEN regardless of user input */
+               size = MAXPATHLEN;
+       }
+
+
        if (size > sizeof (stkbuf)) {
                if ((memp = (caddr_t)kalloc(size)) == 0) return ENOMEM;
                data = memp;
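
The added check forces the kernel-side scratch buffer to MAXPATHLEN whenever the base command is HFS_GETPATH, instead of trusting the length encoded in the user-supplied ioctl word. The relationship between the encoded length and the base command can be seen with the same <sys/ioccom.h> macros the hunk uses; the snippet below is a user-space illustration, not code from this commit:

    #include <stdio.h>
    #include <sys/ioccom.h>
    #include <sys/param.h>          /* MAXPATHLEN */

    typedef char pn_t[MAXPATHLEN];
    #define HFSIOC_GETPATH  _IOWR('h', 13, pn_t)
    #define HFS_GETPATH     IOCBASECMD(HFSIOC_GETPATH)

    int
    main(void)
    {
            /* IOCPARM_LEN() recovers the buffer length encoded in the command;
             * the kernel now rounds its copy buffer up to MAXPATHLEN anyway. */
            printf("encoded length = %lu\n", (unsigned long)IOCPARM_LEN(HFSIOC_GETPATH));
            printf("base command   = 0x%08lx\n", (unsigned long)HFS_GETPATH);
            return 0;
    }
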
index 1e84dc0e92931851c7a13c1e503fd2b532ff8664..747e47ed953022119eacd22dd20cfed9fb68273d 100644 (file)
@@ -1,4 +1,4 @@
-14.1.0
+14.3.0
 
 # The first line of this file contains the master version number for the kernel.
 # All other instances of the kernel version in xnu are derived from this file.
index 173309543942dcab8e399626e25ecedde9908ced..d7995f81e141003d5fdd9158a433591ae30f0c26 100644 (file)
@@ -24,10 +24,20 @@ _buf_create_shadow
 _buf_kernel_addrperm_addr
 _buf_setfilter
 _buf_shadow
+_bufattr_alloc
+_bufattr_dup
+_bufattr_free
+_bufattr_greedymode
+_bufattr_isochronous
+_bufattr_markgreedymode
+_bufattr_markisochronous
+_bufattr_markmeta
+_bufattr_markquickcomplete
 _bufattr_meta
 _bufattr_nocache
-_bufattr_throttled
 _bufattr_passive
+_bufattr_quickcomplete
+_bufattr_throttled
 _cdevsw
 _cdevsw_setkqueueok
 _chudxnu_platform_ptr
diff --git a/libsyscall/mach/.gitignore b/libsyscall/mach/.gitignore
new file mode 100644 (file)
index 0000000..f718d68
--- /dev/null
@@ -0,0 +1,3 @@
+*.pbxuser
+*.perspectivev3
+build/
index f1ba710627fde0b99002eb687fc91c4d2d08ecf5..92e9547bfbab77f865fe85a984ed0535a1bbcd55 100644 (file)
@@ -337,6 +337,9 @@ atm_get_value(
                                if (kr != KERN_SUCCESS) {
                                        break;
                                }
+                       } else {
+                               kr = KERN_INVALID_TASK;
+                               break;
                        }
 
                        /* Increment sync value. */
@@ -939,8 +942,8 @@ atm_listener_insert(
                         */
                        next->mailbox = mailbox;
                        lck_mtx_unlock(&atm_value->listener_lock);
-                       KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (ATM_CODE(ATM_GETVALUE_INFO, (ATM_VALUE_REPLACED))) | DBG_FUNC_NONE,
-                               atm_value, atm_value->aid, mailbox_offset, 0, 0);
+                       KERNEL_DEBUG_CONSTANT((ATM_CODE(ATM_GETVALUE_INFO, (ATM_VALUE_REPLACED))) | DBG_FUNC_NONE,
+                               VM_KERNEL_ADDRPERM(atm_value), atm_value->aid, mailbox_offset, 0, 0);
 
                        /* Drop the extra reference on task descriptor taken by this function. */
                        atm_task_descriptor_dealloc(task_descriptor);
@@ -948,8 +951,8 @@ atm_listener_insert(
                        return KERN_SUCCESS;
                }
        }
-       KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (ATM_CODE(ATM_GETVALUE_INFO, (ATM_VALUE_ADDED))) | DBG_FUNC_NONE,
-                               atm_value, atm_value->aid, mailbox_offset, 0, 0);
+       KERNEL_DEBUG_CONSTANT((ATM_CODE(ATM_GETVALUE_INFO, (ATM_VALUE_ADDED))) | DBG_FUNC_NONE,
+                               VM_KERNEL_ADDRPERM(atm_value), atm_value->aid, mailbox_offset, 0, 0);
 
        queue_enter(&atm_value->listeners, new_link_object, atm_link_object_t, listeners_element);
        atm_value->listener_count++;
@@ -1006,18 +1009,18 @@ atm_listener_delete(
 
                if (elem->descriptor == task_descriptor) {
                        if (elem->mailbox == mailbox) {
-                               KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (ATM_CODE(ATM_UNREGISTER_INFO,
+                               KERNEL_DEBUG_CONSTANT((ATM_CODE(ATM_UNREGISTER_INFO,
                                        (ATM_VALUE_UNREGISTERED))) | DBG_FUNC_NONE,
-                                       atm_value, atm_value->aid, mailbox_offset, 0, 0);
+                                       VM_KERNEL_ADDRPERM(atm_value), atm_value->aid, mailbox_offset, 0, 0);
                                queue_remove(&atm_value->listeners, elem, atm_link_object_t, listeners_element);
                                queue_enter(&free_listeners, elem, atm_link_object_t, listeners_element);
                                atm_value->listener_count--;
                                kr = KERN_SUCCESS;
                                break;
                        } else {
-                               KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (ATM_CODE(ATM_UNREGISTER_INFO,
+                               KERNEL_DEBUG_CONSTANT((ATM_CODE(ATM_UNREGISTER_INFO,
                                        (ATM_VALUE_DIFF_MAILBOX))) | DBG_FUNC_NONE,
-                                       atm_value, atm_value->aid, 0, 0, 0);
+                                       VM_KERNEL_ADDRPERM(atm_value), atm_value->aid, 0, 0, 0);
                                kr = KERN_INVALID_VALUE;
                                break;
                        }
@@ -1255,7 +1258,7 @@ atm_get_min_sub_aid_array(
        atm_value_t atm_value;
        uint32_t i;
 
-       KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (ATM_CODE(ATM_SUBAID_INFO, (ATM_MIN_CALLED))) | DBG_FUNC_START,
+       KERNEL_DEBUG_CONSTANT((ATM_CODE(ATM_SUBAID_INFO, (ATM_MIN_CALLED))) | DBG_FUNC_START,
                        0, 0, 0, 0, 0);
 
        for (i = 0; i < count; i++) {
@@ -1268,7 +1271,7 @@ atm_get_min_sub_aid_array(
                atm_value_dealloc(atm_value);
        }
 
-       KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (ATM_CODE(ATM_SUBAID_INFO, (ATM_MIN_CALLED))) | DBG_FUNC_END,
+       KERNEL_DEBUG_CONSTANT((ATM_CODE(ATM_SUBAID_INFO, (ATM_MIN_CALLED))) | DBG_FUNC_END,
                        count, 0, 0, 0, 0);
 
 }
@@ -1292,7 +1295,7 @@ atm_get_min_sub_aid(atm_value_t atm_value)
        atm_link_object_t next, elem;
        queue_head_t free_listeners;
 
-       KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (ATM_CODE(ATM_SUBAID_INFO, (ATM_MIN_LINK_LIST))) | DBG_FUNC_START,
+       KERNEL_DEBUG_CONSTANT((ATM_CODE(ATM_SUBAID_INFO, (ATM_MIN_LINK_LIST))) | DBG_FUNC_START,
                        0, 0, 0, 0, 0);
 
        lck_mtx_lock(&atm_value->listener_lock);
@@ -1385,7 +1388,7 @@ atm_get_min_sub_aid(atm_value_t atm_value)
                atm_link_dealloc(next);
        }
        
-       KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (ATM_CODE(ATM_SUBAID_INFO, (ATM_MIN_LINK_LIST))) | DBG_FUNC_END,
+       KERNEL_DEBUG_CONSTANT((ATM_CODE(ATM_SUBAID_INFO, (ATM_MIN_LINK_LIST))) | DBG_FUNC_END,
                        j, freed_count, dead_but_not_freed, 0, 0);
 
        /* explicitly upgrade uint32_t to 64 bit mach size */
index b935e14c62334b9c86a8b99743eedbd376e68161..8b00349ebde1cc4fd851a041e755b8c661209ade 100644 (file)
@@ -80,6 +80,7 @@
 #include <device/device_port.h>
 
 ipc_port_t     master_device_port;
+void        *master_device_kobject;
 
 lck_grp_attr_t * dev_lck_grp_attr;
 lck_grp_t * dev_lck_grp;
@@ -93,8 +94,8 @@ device_service_create(void)
        if (master_device_port == IP_NULL)
            panic("can't allocate master device port");
 
-       ipc_kobject_set(master_device_port, 1, IKOT_MASTER_DEVICE);
-       kernel_set_special_port(host_priv_self(), HOST_IO_MASTER_PORT,
+    ipc_kobject_set(master_device_port, (ipc_kobject_t)&master_device_kobject, IKOT_MASTER_DEVICE);
+    kernel_set_special_port(host_priv_self(), HOST_IO_MASTER_PORT,
                                ipc_port_make_send(master_device_port));
 
        /* allocate device lock group attribute and group */
index fb9157b88ec82daa09d848c6afc938b7ce96540a..35d4bf27973e946dd4ab5f2eb00ef4433383dfa0 100644 (file)
@@ -74,6 +74,7 @@
 #include <mach/vm_prot.h>
 #include <mach/machine.h>
 #include <mach/time_value.h>
+#include <sys/kdebug.h>
 #include <kern/spl.h>
 #include <kern/assert.h>
 #include <kern/debug.h>
 #include <architecture/i386/pio.h> /* inb() */
 #include <pexpert/i386/boot.h>
 
+#include <kdp/kdp_dyld.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_kern.h>
 #endif
 
 static void machine_conf(void);
+void panic_print_symbol_name(vm_address_t search);
 
+extern boolean_t init_task_died;
+extern const char      version[];
+extern char    osversion[];
 extern int             max_unsafe_quanta;
 extern int             max_poll_quanta;
 extern unsigned int    panic_is_inited;
 
+extern int     proc_pid(void *p);
+
+/* Definitions for frame pointers */
+#define FP_ALIGNMENT_MASK      ((uint32_t)(0x3))
+#define FP_LR_OFFSET           ((uint32_t)4)
+#define FP_LR_OFFSET64         ((uint32_t)8)
+#define FP_MAX_NUM_TO_EVALUATE (50)
+
 int db_run_mode;
 
 volatile int pbtcpu = -1;
@@ -155,6 +169,93 @@ static unsigned    commit_paniclog_to_nvram;
 
 unsigned int debug_boot_arg;
 
+/*
+ * Walk and print a backtrace, following frame pointers starting at topfp.
+ */
+void
+print_one_backtrace(pmap_t pmap, vm_offset_t topfp, const char *cur_marker,
+       boolean_t is_64_bit, boolean_t nvram_format) 
+{
+       int                 i = 0;
+       addr64_t        lr;
+       addr64_t        fp;
+       addr64_t        fp_for_ppn;
+       ppnum_t         ppn;
+       boolean_t       dump_kernel_stack;
+
+       fp = topfp;
+       fp_for_ppn = 0;
+       ppn = (ppnum_t)NULL;
+
+       if (fp >= VM_MIN_KERNEL_ADDRESS)
+               dump_kernel_stack = TRUE;
+       else
+               dump_kernel_stack = FALSE;
+
+       do {
+               if ((fp == 0) || ((fp & FP_ALIGNMENT_MASK) != 0))
+                       break;
+               if (dump_kernel_stack && ((fp < VM_MIN_KERNEL_ADDRESS) || (fp > VM_MAX_KERNEL_ADDRESS)))
+                       break;
+               if ((!dump_kernel_stack) && (fp >=VM_MIN_KERNEL_ADDRESS))
+                       break;
+                       
+        /* Check to see if current address will result in a different
+           ppn than previously computed (to avoid recomputation) via
+           ((addr ^ fp_for_ppn) >> PAGE_SHIFT) */
+
+               if ((((fp + FP_LR_OFFSET) ^ fp_for_ppn) >> PAGE_SHIFT) != 0x0U) {
+                       ppn = pmap_find_phys(pmap, fp + FP_LR_OFFSET);
+                       fp_for_ppn = fp + (is_64_bit ? FP_LR_OFFSET64 : FP_LR_OFFSET);
+               }
+               if (ppn != (ppnum_t)NULL) {
+                       if (is_64_bit) {
+                               lr = ml_phys_read_double_64(((((vm_offset_t)ppn) << PAGE_SHIFT)) | ((fp + FP_LR_OFFSET64) & PAGE_MASK));
+                       } else {
+                               lr = ml_phys_read_word(((((vm_offset_t)ppn) << PAGE_SHIFT)) | ((fp + FP_LR_OFFSET) & PAGE_MASK));
+                       }
+               } else {
+                       if (is_64_bit) {
+                               kdb_printf("%s\t  Could not read LR from frame at 0x%016llx\n", cur_marker, fp + FP_LR_OFFSET64);
+                       } else {
+                               kdb_printf("%s\t  Could not read LR from frame at 0x%08x\n", cur_marker, (uint32_t)(fp + FP_LR_OFFSET));
+                       }
+                       break;
+               }
+               if (((fp ^ fp_for_ppn) >> PAGE_SHIFT) != 0x0U) {
+                       ppn = pmap_find_phys(pmap, fp);
+                       fp_for_ppn = fp;
+               }
+               if (ppn != (ppnum_t)NULL) {
+                       if (is_64_bit) {
+                               fp = ml_phys_read_double_64(((((vm_offset_t)ppn) << PAGE_SHIFT)) | (fp & PAGE_MASK));
+                       } else {
+                               fp = ml_phys_read_word(((((vm_offset_t)ppn) << PAGE_SHIFT)) | (fp & PAGE_MASK));
+                       }
+               } else {
+                       if (is_64_bit) {
+                               kdb_printf("%s\t  Could not read FP from frame at 0x%016llx\n", cur_marker, fp);
+                       } else {
+                               kdb_printf("%s\t  Could not read FP from frame at 0x%08x\n", cur_marker, (uint32_t)fp);
+                       }
+                       break;
+               }
+
+               if (nvram_format) {
+                       if (is_64_bit) {
+                               kdb_printf("%s\t0x%016llx\n", cur_marker, lr);
+                       } else {
+                               kdb_printf("%s\t0x%08x\n", cur_marker, (uint32_t)lr);
+                       }
+               } else {                
+                       if (is_64_bit) {
+                               kdb_printf("%s\t  lr: 0x%016llx  fp: 0x%016llx\n", cur_marker, lr, fp);
+                       } else {
+                               kdb_printf("%s\t  lr: 0x%08x  fp: 0x%08x\n", cur_marker, (uint32_t)lr, (uint32_t)fp);
+                       }
+               }
+       } while ((++i < FP_MAX_NUM_TO_EVALUATE) && (fp != topfp));
+}
 void
 machine_startup(void)
 {
@@ -171,6 +272,12 @@ machine_startup(void)
                if (debug_boot_arg & DB_PRT) disable_debug_output=FALSE; 
                if (debug_boot_arg & DB_SLOG) systemLogDiags=TRUE; 
                if (debug_boot_arg & DB_LOG_PI_SCRN) logPanicDataToScreen=TRUE;
+#if KDEBUG_MOJO_TRACE
+               if (debug_boot_arg & DB_PRT_KDEBUG) {
+                       kdebug_serial = TRUE;
+                       disable_debug_output = FALSE;
+               }
+#endif
        } else {
                debug_boot_arg = 0;
        }
@@ -757,6 +864,16 @@ machine_halt_cpu(void) {
        pmCPUHalt(PM_HALT_DEBUG);
 }
 
+static int pid_from_task(task_t task)
+{
+        int pid = -1;
+
+        if (task->bsd_info)
+                pid = proc_pid(task->bsd_info);
+
+        return pid;
+}
+
 void
 DebuggerWithContext(
        __unused unsigned int   reason,
@@ -773,6 +890,9 @@ Debugger(
        unsigned long pi_size = 0;
        void *stackptr;
        int cn = cpu_number();
+       task_t task = current_task();
+       int     task_pid = pid_from_task(task);
+
 
        hw_atomic_add(&debug_mode, 1);   
        if (!panic_is_inited) {
@@ -802,7 +922,12 @@ Debugger(
                __asm__ volatile("movq %%rbp, %0" : "=m" (stackptr));
 
                /* Print backtrace - callee is internally synchronized */
-               panic_i386_backtrace(stackptr, ((panic_double_fault_cpu == cn) ? 80: 48), NULL, FALSE, NULL);
+               if ((task_pid == 1) && (init_task_died)) {
+                       /* Special handling of launchd died panics */
+                       print_launchd_info();
+               } else {
+                       panic_i386_backtrace(stackptr, ((panic_double_fault_cpu == cn) ? 80: 48), NULL, FALSE, NULL);
+               }
 
                /* everything should be printed now so copy to NVRAM
                 */
@@ -994,7 +1119,7 @@ panic_print_kmod_symbol_name(vm_address_t search)
     }
 }
 
-static void
+void
 panic_print_symbol_name(vm_address_t search)
 {
     /* try searching in the kernel */
@@ -1138,3 +1263,184 @@ out:
        bt_tsc_timeout = rdtsc64() + PBT_TIMEOUT_CYCLES;
        while(*ppbtcnt && (rdtsc64() < bt_tsc_timeout));
 }
+
+static boolean_t
+debug_copyin(pmap_t p, uint64_t uaddr, void *dest, size_t size)
+{
+        size_t rem = size;
+        char *kvaddr = dest;
+
+        while (rem) {
+                ppnum_t upn = pmap_find_phys(p, uaddr);
+                uint64_t phys_src = ptoa_64(upn) | (uaddr & PAGE_MASK);
+                uint64_t phys_dest = kvtophys((vm_offset_t)kvaddr);
+                uint64_t src_rem = PAGE_SIZE - (phys_src & PAGE_MASK);
+                uint64_t dst_rem = PAGE_SIZE - (phys_dest & PAGE_MASK);
+                size_t cur_size = (uint32_t) MIN(src_rem, dst_rem);
+                cur_size = MIN(cur_size, rem);
+
+                if (upn && pmap_valid_page(upn) && phys_dest) {
+                        bcopy_phys(phys_src, phys_dest, cur_size);
+                }
+                else
+                        break;
+                uaddr += cur_size;
+                kvaddr += cur_size;
+                rem -= cur_size;
+        }
+        return (rem == 0);
+}
+
+void
+print_threads_registers(thread_t thread)
+{
+       x86_saved_state_t *savestate;
+       
+       savestate = get_user_regs(thread);
+       kdb_printf(
+               "\nRAX: 0x%016llx, RBX: 0x%016llx, RCX: 0x%016llx, RDX: 0x%016llx\n"
+           "RSP: 0x%016llx, RBP: 0x%016llx, RSI: 0x%016llx, RDI: 0x%016llx\n"
+           "R8:  0x%016llx, R9:  0x%016llx, R10: 0x%016llx, R11: 0x%016llx\n"
+               "R12: 0x%016llx, R13: 0x%016llx, R14: 0x%016llx, R15: 0x%016llx\n"
+               "RFL: 0x%016llx, RIP: 0x%016llx, CS:  0x%016llx, SS:  0x%016llx\n\n",
+               savestate->ss_64.rax, savestate->ss_64.rbx, savestate->ss_64.rcx, savestate->ss_64.rdx,
+               savestate->ss_64.isf.rsp, savestate->ss_64.rbp, savestate->ss_64.rsi, savestate->ss_64.rdi,
+               savestate->ss_64.r8, savestate->ss_64.r9,  savestate->ss_64.r10, savestate->ss_64.r11,
+               savestate->ss_64.r12, savestate->ss_64.r13, savestate->ss_64.r14, savestate->ss_64.r15,
+               savestate->ss_64.isf.rflags, savestate->ss_64.isf.rip, savestate->ss_64.isf.cs,
+               savestate->ss_64.isf.ss);
+}
+
+void
+print_tasks_user_threads(task_t task)
+{
+       thread_t                thread = current_thread();
+       x86_saved_state_t *savestate;
+       pmap_t                  pmap = 0;
+       uint64_t                rbp;
+       const char              *cur_marker = 0;
+       int             j;
+       
+       for (j = 0, thread = (thread_t) queue_first(&task->threads); j < task->thread_count;
+                       ++j, thread = (thread_t) queue_next(&thread->task_threads)) {
+
+               kdb_printf("Thread  %p\n", thread);
+               pmap = get_task_pmap(task);
+               savestate = get_user_regs(thread);
+               rbp = savestate->ss_64.rbp;
+               print_one_backtrace(pmap, (vm_offset_t)rbp, cur_marker, TRUE, TRUE);
+               kdb_printf("\n");
+               }
+}
+
+#define PANICLOG_UUID_BUF_SIZE 256
+
+void print_uuid_info(task_t task)
+{
+       uint32_t                uuid_info_count = 0;
+       mach_vm_address_t       uuid_info_addr = 0;
+       boolean_t               have_map = (task->map != NULL) &&       (ml_validate_nofault((vm_offset_t)(task->map), sizeof(struct _vm_map)));
+       boolean_t               have_pmap = have_map && (task->map->pmap != NULL) && (ml_validate_nofault((vm_offset_t)(task->map->pmap), sizeof(struct pmap)));
+       int                             task_pid = pid_from_task(task);
+       char                    uuidbuf[PANICLOG_UUID_BUF_SIZE] = {0};
+       char                    *uuidbufptr = uuidbuf;
+       uint32_t                k;
+
+       if (have_pmap && task->active && task_pid > 0) {
+               /* Read dyld_all_image_infos struct from task memory to get UUID array count & location */
+               struct user64_dyld_all_image_infos task_image_infos;
+               if (debug_copyin(task->map->pmap, task->all_image_info_addr,
+                       &task_image_infos, sizeof(struct user64_dyld_all_image_infos))) {
+                       uuid_info_count = (uint32_t)task_image_infos.uuidArrayCount;
+                       uuid_info_addr = task_image_infos.uuidArray;
+               }
+
+               /* If we get a NULL uuid_info_addr (which can happen when we catch dyld
+                * in the middle of updating this data structure), we zero the
+                * uuid_info_count so that we won't even try to save load info for this task
+                */
+               if (!uuid_info_addr) {
+                       uuid_info_count = 0;
+               }
+       }
+
+       if (task_pid > 0 && uuid_info_count > 0) {
+               uint32_t uuid_info_size = sizeof(struct user64_dyld_uuid_info);
+               uint32_t uuid_array_size = uuid_info_count * uuid_info_size;
+               uint32_t uuid_copy_size = 0;
+               uint32_t uuid_image_count = 0;
+               char *current_uuid_buffer = NULL;
+               /* Copy in the UUID info array. It may be nonresident, in which case just fix up nloadinfos to 0 */
+               
+               kdb_printf("\nuuid info:\n");
+               while (uuid_array_size) {
+                       if (uuid_array_size <= PANICLOG_UUID_BUF_SIZE) {
+                               uuid_copy_size = uuid_array_size;
+                               uuid_image_count = uuid_array_size/uuid_info_size;
+                       } else {
+                               uuid_image_count = PANICLOG_UUID_BUF_SIZE/uuid_info_size;
+                               uuid_copy_size = uuid_image_count * uuid_info_size;
+                       }
+                       if (have_pmap && !debug_copyin(task->map->pmap, uuid_info_addr, uuidbufptr,
+                               uuid_copy_size)) {
+                               kdb_printf("Error!! Failed to copy UUID info for task %p pid %d\n", task, task_pid);
+                               uuid_image_count = 0;
+                               break;
+                       }
+
+                       if (uuid_image_count > 0) {
+                               current_uuid_buffer = uuidbufptr;
+                               for (k = 0; k < uuid_image_count; k++) {
+                                       kdb_printf(" %#llx", *(uint64_t *)current_uuid_buffer);
+                                       current_uuid_buffer += sizeof(uint64_t);
+                                       uint8_t *uuid = (uint8_t *)current_uuid_buffer;
+                                       kdb_printf("\tuuid = <%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x>\n",
+                                       uuid[0], uuid[1], uuid[2], uuid[3], uuid[4], uuid[5], uuid[6], uuid[7], uuid[8],
+                                       uuid[9], uuid[10], uuid[11], uuid[12], uuid[13], uuid[14], uuid[15]);
+                                       current_uuid_buffer += 16;
+                               }
+                               bzero(&uuidbuf, sizeof(uuidbuf));
+                       }
+                       uuid_info_addr += uuid_copy_size;
+                       uuid_array_size -= uuid_copy_size;
+               }
+       }
+}
+
+void print_launchd_info(void)
+{
+       task_t          task = current_task();
+       thread_t        thread = current_thread();
+       volatile        uint32_t *ppbtcnt = &pbtcnt;
+       uint64_t        bt_tsc_timeout;
+       int             cn = cpu_number();
+
+       if(pbtcpu != cn) {
+               hw_atomic_add(&pbtcnt, 1);
+               /* Spin on print backtrace lock, which serializes output
+                * Continue anyway if a timeout occurs.
+                */
+               hw_lock_to(&pbtlock, ~0U);
+               pbtcpu = cn;
+       }
+       
+       print_uuid_info(task);
+       print_threads_registers(thread);
+       print_tasks_user_threads(task);
+       kdb_printf("Mac OS version: %s\n", (osversion[0] != 0) ? osversion : "Not yet set");
+       kdb_printf("Kernel version: %s\n", version);
+       panic_display_kernel_uuid();
+       panic_display_model_name();
+       
+       /* Release print backtrace lock, to permit other callers in the
+        * event of panics on multiple processors.
+        */
+       hw_lock_unlock(&pbtlock);
+       hw_atomic_sub(&pbtcnt, 1);
+       /* Wait for other processors to complete output
+        * Timeout and continue after PBT_TIMEOUT_CYCLES.
+        */
+       bt_tsc_timeout = rdtsc64() + PBT_TIMEOUT_CYCLES;
+       while(*ppbtcnt && (rdtsc64() < bt_tsc_timeout));
+
+}
index e42d4aef22bbf9744fe52d54cb89853d50c96497..76617a0da90dd32591e606b7b4a20337cd6f8fb8 100644 (file)
@@ -35,6 +35,9 @@
 #if CONFIG_MTRR
 #include <i386/mtrr.h>
 #endif
+#if HYPERVISOR
+#include <kern/hv_support.h>
+#endif
 #if CONFIG_VMX
 #include <i386/vmx/vmx_cpu.h>
 #endif
@@ -193,6 +196,11 @@ acpi_sleep_kernel(acpi_sleep_callback func, void *refcon)
        /* Save power management timer state */
        pmTimerSave();
 
+#if HYPERVISOR
+       /* Notify hypervisor that we are about to sleep */
+       hv_suspend();
+#endif
+
 #if CONFIG_VMX
        /* 
         * Turn off VT, otherwise switching to legacy mode will fail
index 532c49ee344ae9ec083e8d0eb736c432df924eea..9cfd5892b4c6322e5033f39314c7b2eae16b5447 100644 (file)
@@ -762,12 +762,10 @@ cpuid_set_cpufamily(i386_cpu_info_t *info_p)
                case CPUID_MODEL_CRYSTALWELL:
                        cpufamily = CPUFAMILY_INTEL_HASWELL;
                        break;
-#if !defined(XNU_HIDE_SEED)
                case CPUID_MODEL_BROADWELL:
                case CPUID_MODEL_BRYSTALWELL:
                        cpufamily = CPUFAMILY_INTEL_BROADWELL;
                        break;
-#endif /* not XNU_HIDE_SEED */
                }
                break;
        }
@@ -944,9 +942,7 @@ leaf7_feature_map[] = {
        {CPUID_LEAF7_FEATURE_RTM,      "RTM"},
        {CPUID_LEAF7_FEATURE_RDSEED,   "RDSEED"},
        {CPUID_LEAF7_FEATURE_ADX,      "ADX"},
-#if !defined(XNU_HIDE_SEED)
        {CPUID_LEAF7_FEATURE_SMAP,     "SMAP"},
-#endif /* not XNU_HIDE_SEED */
        {0, 0}
 };
 
index 1f58d5250ab404f60951e99e8ed7b78ab497e33b..980945d50a80ba4943166263530f07eefed497d4 100644 (file)
 #define CPUID_LEAF7_FEATURE_RTM      _Bit(11)  /* RTM */
 #define CPUID_LEAF7_FEATURE_RDSEED   _Bit(18)  /* RDSEED Instruction */
 #define CPUID_LEAF7_FEATURE_ADX      _Bit(19)  /* ADX Instructions */
-#if !defined(XNU_HIDE_SEED)
 #define CPUID_LEAF7_FEATURE_SMAP     _Bit(20)  /* Supervisor Mode Access Protect */
-#endif /* not XNU_HIDE_SEED */
 
 /*
  * The CPUID_EXTFEATURE_XXX values define 64-bit values
 #define CPUID_MODEL_HASWELL            0x3C
 #define CPUID_MODEL_HASWELL_EP         0x3F
 #define CPUID_MODEL_HASWELL_ULT                0x45
-#if !defined(XNU_HIDE_SEED)
 #define CPUID_MODEL_BROADWELL          0x3D
 #define CPUID_MODEL_BROADWELL_ULX      0x3D
 #define CPUID_MODEL_BROADWELL_ULT      0x3D
 #define CPUID_MODEL_BRYSTALWELL                0x47
-#endif /* not XNU_HIDE_SEED */
 
 #define CPUID_VMM_FAMILY_UNKNOWN       0x0
 #define CPUID_VMM_FAMILY_VMWARE                0x1
index 113031cfac737e181f2715655173227771eae530..e561a690026fc6abd94b6f194371e5b5f9887661 100644 (file)
@@ -167,7 +167,7 @@ void panic_dump_mem(const void *addr, int len)
        }
 }
 
-bool panic_phys_range_before(const void *addr, uint64_t *pphys, 
+boolean_t panic_phys_range_before(const void *addr, uint64_t *pphys, 
                                                         panic_phys_range_t *range)
 {
        *pphys = kvtophys((vm_offset_t)addr);
index 92905ebb402e030c58957a00b119d931c3eba86d..10b38e5758e47e465a155d83f94611cfd340ef95 100644 (file)
@@ -32,7 +32,7 @@
 #if XNU_KERNEL_PRIVATE
 
 #include <stdint.h>
-#include <stdbool.h>
+#include <mach/i386/boolean.h>
 
 typedef struct {
        uint64_t        opaque[6];
@@ -53,7 +53,7 @@ typedef struct panic_phys_range {
        uint64_t len;
 } panic_phys_range_t;
 
-bool panic_phys_range_before(const void *addr, uint64_t *pphys, 
+boolean_t panic_phys_range_before(const void *addr, uint64_t *pphys, 
                                                         panic_phys_range_t *range);
 
 #endif // XNU_KERNEL_PRIVATE
index 2229ab8a82ce1c2618d9419ce7a3e1d41bcba76d..ba0f1b1e5494eaf373df2dd7ef915ea55e474402 100644 (file)
@@ -373,6 +373,16 @@ static inline void invlpg(uintptr_t addr)
        __asm__  volatile("invlpg (%0)" :: "r" (addr) : "memory");
 }
 
+static inline void clac(void)
+{
+       __asm__  volatile("clac");
+}
+
+static inline void stac(void)
+{
+       __asm__  volatile("stac");
+}
+
 /*
  * Access to machine-specific registers (available on 586 and better only)
  * Note: the rd* operations modify the parameters directly (without using
index 0cedaa19dd6d9268b06abb3a646c1142662d0caa..3a99e32a7de3cc328f70cf115a0a67db5e88cf05 100644 (file)
@@ -624,6 +624,17 @@ kernel_trap(
                                        goto debugger_entry;
                                }
 
+                               /*
+                                * Additionally check for SMAP faults...
+                                * which are characterized by page-present and
+                                * the AC bit unset (i.e. not from copyin/out path).
+                                */
+                               if (__improbable(code & T_PF_PROT &&
+                                                pmap_smap_enabled &&
+                                                (saved_state->isf.rflags & EFL_AC) == 0)) {
+                                       goto debugger_entry;
+                               }
+
                                /*
                                 * If we're not sharing cr3 with the user
                                 * and we faulted in copyio,
@@ -802,6 +813,7 @@ panic_trap(x86_saved_state64_t *regs)
        const char      *trapname = "Unknown";
        pal_cr_t        cr0, cr2, cr3, cr4;
        boolean_t       potential_smep_fault = FALSE, potential_kernel_NX_fault = FALSE;
+       boolean_t       potential_smap_fault = FALSE;
 
        pal_get_control_registers( &cr0, &cr2, &cr3, &cr4 );
        assert(ml_get_interrupts_enabled() == FALSE);
@@ -826,6 +838,12 @@ panic_trap(x86_saved_state64_t *regs)
                } else if (regs->isf.rip >= VM_MIN_KERNEL_AND_KEXT_ADDRESS) {
                        potential_kernel_NX_fault = TRUE;
                }
+       } else if (pmap_smap_enabled &&
+                  regs->isf.trapno == T_PAGE_FAULT &&
+                  regs->isf.err & T_PF_PROT &&
+                  regs->cr2 < VM_MAX_USER_PAGE_ADDRESS &&
+                  regs->isf.rip >= VM_MIN_KERNEL_AND_KEXT_ADDRESS) {
+               potential_smap_fault = TRUE;
        }
 
 #undef panic
@@ -848,7 +866,7 @@ panic_trap(x86_saved_state64_t *regs)
              virtualized ? " VMM" : "",
              potential_kernel_NX_fault ? " Kernel NX fault" : "",
              potential_smep_fault ? " SMEP/User NX fault" : "",
-             "");
+             potential_smap_fault ? " SMAP fault" : "");
        /*
         * This next statement is not executed,
         * but it's needed to stop the compiler using tail call optimization
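
The new branch treats a protection-mode kernel page fault as a likely SMAP violation when SMAP is enabled and EFLAGS.AC is clear, because sanctioned kernel accesses to user memory bracket themselves with the stac()/clac() inlines added to proc_reg.h above. A schematic of that bracketing pattern follows; it shows the general idea only and is not the literal xnu copyio implementation:

    #include <string.h>

    /* From the proc_reg.h hunk above: toggle EFLAGS.AC around user access. */
    static inline void stac(void) { __asm__ volatile("stac"); }
    static inline void clac(void) { __asm__ volatile("clac"); }

    /* Schematic only: real copyin() also validates addresses and handles faults. */
    static int
    copyin_sketch(const void *user_src, void *kernel_dst, size_t len)
    {
            stac();                              /* permit supervisor access to user pages */
            memcpy(kernel_dst, user_src, len);   /* a fault here occurs with AC set */
            clac();                              /* any user access with AC clear is what
                                                  * the trap handler above now flags */
            return 0;
    }
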
index 5eed5e2d12aeeff5aa1982dfe5abae8817a96ed7..619f87eaf4741efa2d951218596e696c995f09f6 100644 (file)
@@ -151,6 +151,12 @@ extern volatile perfASTCallback perfASTHook;
 extern volatile perfCallback perfIntHook;
 
 extern void            panic_i386_backtrace(void *, int, const char *, boolean_t, x86_saved_state_t *);
+extern void    print_one_backtrace(pmap_t pmap, vm_offset_t topfp, const char *cur_marker,     boolean_t is_64_bit, boolean_t nvram_format);
+extern void    print_tasks_user_threads(task_t task);
+extern void    print_threads_registers(thread_t thread);
+extern void    print_uuid_info(task_t task);
+extern void    print_launchd_info(void);
+
 #if MACH_KDP
 extern boolean_t       kdp_i386_trap(
                                unsigned int,
index 67c303afeae526ab61f567e5bcd19116dea4aaf5..cc0acd9127d0156109b7a4adbb56149dc760a5c7 100644 (file)
@@ -480,6 +480,7 @@ mach_port_kobject(
        kaddr = (mach_vm_address_t)port->ip_kobject;
        ip_unlock(port);
 
+
        if (0 != kaddr && is_ipc_kobject(*typep))
                *addrp = VM_KERNEL_UNSLIDE_OR_PERM(kaddr);
        else
index 3fc7a40a71a78c040efad7800e6e88e11da93c89..c2edb90698487d9979142a53aad9160541baf997 100644 (file)
@@ -49,6 +49,7 @@
 /* BSD KERN COMPONENT INTERFACE */
 
 task_t bsd_init_task = TASK_NULL;
+boolean_t init_task_died;
 char   init_task_failure_data[1024];
 extern unsigned int not_in_kdp; /* Skip acquiring locks if we're in kdp */
  
index 6dc10f748c70426f062e09e73960e8cadcdeb328..6f527a66fdb67e6c44fdbb5ff665e28b3d651177 100644 (file)
@@ -101,6 +101,7 @@ unsigned int        disable_debug_output = TRUE;
 unsigned int   systemLogDiags = FALSE;
 unsigned int   panicDebugging = FALSE;
 unsigned int   logPanicDataToScreen = FALSE;
+unsigned int   kdebug_serial = FALSE;
 
 int mach_assert = 1;
 
@@ -497,7 +498,7 @@ void populate_model_name(char *model_string) {
        strlcpy(model_name, model_string, sizeof(model_name));
 }
 
-static void panic_display_model_name(void) {
+void panic_display_model_name(void) {
        char tmp_model_name[sizeof(model_name)];
 
        if (ml_nofault_copy((vm_offset_t) &model_name, (vm_offset_t) &tmp_model_name, sizeof(model_name)) != sizeof(model_name))
@@ -509,7 +510,7 @@ static void panic_display_model_name(void) {
                kdb_printf("System model name: %s\n", tmp_model_name);
 }
 
-static void panic_display_kernel_uuid(void) {
+void panic_display_kernel_uuid(void) {
        char tmp_kernel_uuid[sizeof(kernel_uuid_string)];
 
        if (ml_nofault_copy((vm_offset_t) &kernel_uuid_string, (vm_offset_t) &tmp_kernel_uuid, sizeof(kernel_uuid_string)) != sizeof(kernel_uuid_string))
@@ -628,6 +629,8 @@ __private_extern__ void panic_display_ecc_errors()
 #if CONFIG_ZLEAKS
 extern boolean_t       panic_include_ztrace;
 extern struct ztrace* top_ztrace;
+void panic_print_symbol_name(vm_address_t search);
+
 /*
  * Prints the backtrace most suspected of being a leaker, if we paniced in the zone allocator.
  * top_ztrace and panic_include_ztrace comes from osfmk/kern/zalloc.c
@@ -636,6 +639,9 @@ __private_extern__ void panic_display_ztrace(void)
 {
        if(panic_include_ztrace == TRUE) {
                unsigned int i = 0;
+               boolean_t keepsyms = FALSE;
+
+               PE_parse_boot_argn("keepsyms", &keepsyms, sizeof (keepsyms));
                struct ztrace top_ztrace_copy;
                
                /* Make sure not to trip another panic if there's something wrong with memory */
@@ -643,7 +649,11 @@ __private_extern__ void panic_display_ztrace(void)
                        kdb_printf("\nBacktrace suspected of leaking: (outstanding bytes: %lu)\n", (uintptr_t)top_ztrace_copy.zt_size);
                        /* Print the backtrace addresses */
                        for (i = 0; (i < top_ztrace_copy.zt_depth && i < MAX_ZTRACE_DEPTH) ; i++) {
-                               kdb_printf("%p\n", top_ztrace_copy.zt_stack[i]);
+                               kdb_printf("%p ", top_ztrace_copy.zt_stack[i]);
+                               if (keepsyms) {
+                                       panic_print_symbol_name((vm_address_t)top_ztrace_copy.zt_stack[i]);
+                               }
+                               kdb_printf("\n");
                        }
                        /* Print any kexts in that backtrace, along with their link addresses so we can properly blame them */
                        kmod_panic_dump((vm_offset_t *)&top_ztrace_copy.zt_stack[0], top_ztrace_copy.zt_depth);
index 85acc47fd98b0a92826ead2d3e1dd415142deb60..407d4b4f28f985133ed8a5b260795b5f80914078 100644 (file)
@@ -299,6 +299,7 @@ extern unsigned int         disable_debug_output;
 
 extern unsigned int    panicDebugging;
 extern unsigned int    logPanicDataToScreen;
+extern unsigned int    kdebug_serial;
 
 extern int db_run_mode;
 
@@ -332,6 +333,8 @@ void        panic_display_system_configuration(void);
 void   panic_display_zprint(void);
 void   panic_display_kernel_aslr(void);
 void   panic_display_hibb(void);
+void   panic_display_model_name(void);
+void   panic_display_kernel_uuid(void);
 #if CONFIG_ZLEAKS
 void   panic_display_ztrace(void);
 #endif /* CONFIG_ZLEAKS */
@@ -359,7 +362,8 @@ void        panic_display_ecc_errors(void);
                                                * post-panic crashdump/paniclog
                                                * dump.
                                                */
-#define DB_NMI_BTN_ENA  0x8000 /* Enable button to directly trigger NMI */
+#define DB_NMI_BTN_ENA  0x8000  /* Enable button to directly trigger NMI */
+#define DB_PRT_KDEBUG   0x10000 /* kprintf KDEBUG traces */
 
 #if DEBUG
 /*
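The DB_PRT_KDEBUG bit added above extends the existing debug= boot-arg bitmask, and the startup.c hunk further down only forces a minimal trace buffer and typefilter once kdebug_serial is already TRUE, so some earlier piece of boot code must translate the bit into the flag. That consumer is not part of this excerpt; the following is only an illustrative sketch of how such a translation would typically look, not code from the commit:

/* Illustrative sketch (assumed, not from this commit): turn the
 * DB_PRT_KDEBUG bit of the debug= boot-arg into kdebug_serial. */
unsigned int debug_boot_arg = 0;

if (PE_parse_boot_argn("debug", &debug_boot_arg, sizeof(debug_boot_arg))) {
	if (debug_boot_arg & DB_PRT_KDEBUG)	/* e.g. boot-args="debug=0x10000" */
		kdebug_serial = TRUE;
}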
index 9d032d2d0362d8f323e8d758b4164e70aa4214c4..c60df98863f3de3d366d7c55dbc0e0c9150c4ad6 100644 (file)
@@ -44,6 +44,7 @@ int hv_support_available = 0;
 hv_callbacks_t hv_callbacks = {
        .dispatch = NULL,               /* thread is being dispatched for execution */
        .preempt = NULL,                /* thread is being preempted */
+       .suspend = NULL,                /* system is being suspended */
        .thread_destroy = NULL, /* thread is being destroyed */
        .task_destroy = NULL,   /* task is being destroyed */
        .volatile_state = NULL, /* thread state is becoming volatile */
@@ -142,7 +143,7 @@ hv_mp_notify(void) {
                        lck_mtx_unlock(hv_support_lck_mtx);
                        break;
                } else {
-                       hv_callbacks.memory_pressure(NULL);
+                       hv_callbacks.memory_pressure();
                }
                lck_mtx_unlock(hv_support_lck_mtx);
        }
@@ -244,6 +245,7 @@ hv_release_callbacks(void) {
        hv_callbacks = (hv_callbacks_t) {
                .dispatch = NULL,
                .preempt = NULL,
+               .suspend = NULL,
                .thread_destroy = NULL,
                .task_destroy = NULL,
                .volatile_state = NULL,
@@ -254,6 +256,14 @@ hv_release_callbacks(void) {
        lck_mtx_unlock(hv_support_lck_mtx);
 }
 
+/* system suspend notification */
+void
+hv_suspend(void) {
+       if (hv_callbacks_enabled) {
+               hv_callbacks.suspend();
+       }
+}
+
 /* dispatch hv_task_trap/hv_thread_trap syscalls to trap handlers,
    fail for invalid index or absence of trap handlers, trap handler is
    responsible for validating targets */
index 485654f707eb3442d81dd951dffab2c831a626eb..aaedb76ae5e0c604aba4014826cae3493a28d22c 100644 (file)
@@ -45,9 +45,7 @@ typedef enum {
        HV_THREAD_TRAP = 1
 } hv_trap_type_t;
 
-typedef kern_return_t (*hv_trap_t) (void *thread_target, uint64_t arg);
-typedef void (*hv_callback_0_t)(void *target);
-typedef void (*hv_callback_1_t)(void *target, int argument);
+typedef kern_return_t (*hv_trap_t) (void *target, uint64_t arg);
 
 typedef struct  {
        const hv_trap_t *traps;
@@ -55,12 +53,13 @@ typedef struct  {
 } hv_trap_table_t;
 
 typedef struct {
-       hv_callback_0_t dispatch;
-       hv_callback_0_t preempt;
-       hv_callback_0_t thread_destroy;
-       hv_callback_0_t task_destroy;
-       hv_callback_1_t volatile_state;
-       hv_callback_0_t memory_pressure;
+       void (*dispatch)(void *vcpu);
+       void (*preempt)(void *vcpu);
+       void (*suspend)(void);
+       void (*thread_destroy)(void *vcpu);
+       void (*task_destroy)(void *vm);
+       void (*volatile_state)(void *vcpu, int state);
+       void (*memory_pressure)(void);
 } hv_callbacks_t;
 
 extern hv_callbacks_t hv_callbacks;
@@ -79,7 +78,8 @@ extern kern_return_t hv_set_traps(hv_trap_type_t trap_type,
        const hv_trap_t *traps, unsigned trap_count);
 extern void hv_release_traps(hv_trap_type_t trap_type);
 extern kern_return_t hv_set_callbacks(hv_callbacks_t callbacks);
-extern void hv_release_callbacks(void) ;
+extern void hv_release_callbacks(void);
+extern void hv_suspend(void);
 extern kern_return_t hv_task_trap(uint64_t index, uint64_t arg);
 extern kern_return_t hv_thread_trap(uint64_t index, uint64_t arg);
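The reworked callback table above is registered by value through hv_set_callbacks(). A minimal registration sketch, assuming <kern/hv_support.h> and using placeholder handler names (none of the hv_client_* symbols exist in xnu), shows where the new suspend hook fits:

/* Illustrative client-side sketch; handler names and bodies are placeholders. */
static void hv_client_dispatch(void *vcpu)        { (void)vcpu; /* restore guest state */ }
static void hv_client_preempt(void *vcpu)         { (void)vcpu; /* save guest state */ }
static void hv_client_suspend(void)               { /* quiesce VMX state before system sleep */ }
static void hv_client_thread_destroy(void *vcpu)  { (void)vcpu; }
static void hv_client_task_destroy(void *vm)      { (void)vm; }
static void hv_client_volatile(void *vcpu, int s) { (void)vcpu; (void)s; }
static void hv_client_memory_pressure(void)       { /* trim guest caches */ }

static kern_return_t
hv_client_register(void)
{
	hv_callbacks_t cb = {
		.dispatch        = hv_client_dispatch,
		.preempt         = hv_client_preempt,
		.suspend         = hv_client_suspend,		/* new in this revision */
		.thread_destroy  = hv_client_thread_destroy,
		.task_destroy    = hv_client_task_destroy,
		.volatile_state  = hv_client_volatile,
		.memory_pressure = hv_client_memory_pressure,
	};
	return hv_set_callbacks(cb);	/* may be refused if callbacks are already claimed */
}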
 
index 725164c77c311d009a3af2106aaab9e63121e3fb..819c6c686fc583cb0d67d3d2d82763a8c5ba340f 100644 (file)
@@ -162,6 +162,7 @@ struct sfi_class_state {
        uint64_t        off_time_interval;
 
        timer_call_data_t       on_timer;
+       uint64_t        on_timer_deadline;
        boolean_t                       on_timer_programmed;
 
        boolean_t       class_sfi_is_enabled;
@@ -335,12 +336,15 @@ static void sfi_timer_global_off(
 
                        /* Push out on-timer */
                        on_timer_deadline = now + sfi_classes[i].off_time_interval;
+                       sfi_classes[i].on_timer_deadline = on_timer_deadline;
+
                        timer_call_enter1(&sfi_classes[i].on_timer, NULL, on_timer_deadline, TIMER_CALL_SYS_CRITICAL);
                } else {
                        /* If this class no longer needs SFI, make sure the timer is cancelled */
                        sfi_classes[i].class_in_on_phase = TRUE;
                        if (sfi_classes[i].on_timer_programmed) {
                                sfi_classes[i].on_timer_programmed = FALSE;
+                               sfi_classes[i].on_timer_deadline = ~0ULL;
                                timer_call_cancel(&sfi_classes[i].on_timer);
                        }
                }
@@ -420,7 +424,10 @@ static void sfi_timer_per_class_on(
         * Since we have the sfi_lock held and have changed "class_in_on_phase", we expect
         * no new threads to be put on this wait queue until the global "off timer" has fired.
         */
+
        sfi_class->class_in_on_phase = TRUE;
+       sfi_class->on_timer_programmed = FALSE;
+
        kret = wait_queue_wakeup64_all(&sfi_class->wait_queue,
                                                                   CAST_EVENT64_T(sfi_class_id),
                                                                   THREAD_AWAKENED);
@@ -532,6 +539,52 @@ kern_return_t sfi_window_cancel(void)
        return (KERN_SUCCESS);
 }
 
+/* Defers SFI off and per-class on timers (if live) by the specified interval
+ * in Mach Absolute Time Units. Currently invoked to align with the global
+ * forced idle mechanism. Making some simplifying assumptions, the iterative GFI
+ * induced SFI on+off deferrals form a geometric series that converges to yield
+ * an effective SFI duty cycle that is scaled by the GFI duty cycle. Initial phase
+ * alignment and congruency of the SFI/GFI periods can distort this to some extent.
+ */
+
+kern_return_t sfi_defer(uint64_t sfi_defer_matus)
+{
+       spl_t           s;
+       kern_return_t kr = KERN_FAILURE;
+       s = splsched();
+
+       KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_GLOBAL_DEFER), sfi_defer_matus, 0, 0, 0, 0);
+
+       simple_lock(&sfi_lock);
+       if (!sfi_is_enabled) {
+               goto sfi_defer_done;
+       }
+
+       assert(sfi_next_off_deadline != 0);
+
+       sfi_next_off_deadline += sfi_defer_matus;
+       timer_call_enter1(&sfi_timer_call_entry, NULL, sfi_next_off_deadline, TIMER_CALL_SYS_CRITICAL);
+
+       int i;
+       for (i = 0; i < MAX_SFI_CLASS_ID; i++) {
+               if (sfi_classes[i].class_sfi_is_enabled) {
+                       if (sfi_classes[i].on_timer_programmed) {
+                               uint64_t new_on_deadline = sfi_classes[i].on_timer_deadline + sfi_defer_matus;
+                               sfi_classes[i].on_timer_deadline = new_on_deadline;
+                               timer_call_enter1(&sfi_classes[i].on_timer, NULL, new_on_deadline, TIMER_CALL_SYS_CRITICAL);
+                       }
+               }
+       }
+
+       kr = KERN_SUCCESS;
+sfi_defer_done:
+       simple_unlock(&sfi_lock);
+
+       splx(s);
+
+       return (kr);
+}
+
 
 kern_return_t sfi_get_window(uint64_t *window_usecs)
 {
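On the geometric-series remark in the sfi_defer() comment: one concrete reading, under the same simplifying assumptions the comment makes, is that each deferral stretches the nominal SFI window T by the forced-idle fraction g, so the effective window converges to T * (1 + g + g^2 + ...) = T / (1 - g) and the off-phase duty cycle shrinks from d to d * (1 - g). A tiny illustration of that reading (not xnu code):

/* Illustrative only: effective SFI off-phase duty cycle under iterative
 * GFI deferral, assuming the geometric-series interpretation above.
 * sfi_off_duty  = nominal off-time / nominal SFI window
 * gfi_idle_duty = fraction of time GFI forces the system idle */
static double
sfi_effective_off_duty(double sfi_off_duty, double gfi_idle_duty)
{
	return sfi_off_duty * (1.0 - gfi_idle_duty);
}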
index 385b57cf01e522fcd923c6f7224cceb8954a8fbe..7ac6259b3e6ae96a44d89f3d3bfe30c23d5cac58 100644 (file)
@@ -64,6 +64,7 @@ ast_t sfi_processor_needs_ast(processor_t processor);
 
 void sfi_ast(thread_t thread);
 void sfi_reevaluate(thread_t thread);
+kern_return_t sfi_defer(uint64_t);
 #endif /* MACH_KERNEL_PRIVATE */
 
 #endif /* _KERN_SFI_H_ */
index e67c01a8b493e48e08a70ebb61ced5f02fef83af..6ddd0389c8526a9c1b6acf60efab0e16d3310ff3 100644 (file)
@@ -192,6 +192,7 @@ unsigned int new_nkdbufs = 0;
 unsigned int wake_nkdbufs = 0;
 unsigned int write_trace_on_panic = 0;
 unsigned int trace_typefilter = 0;
+boolean_t    trace_serial = FALSE;
 
 /* mach leak logging */
 int log_leaks = 0;
@@ -480,6 +481,11 @@ kernel_bootstrap_thread(void)
 #endif
 
 #if (defined(__i386__) || defined(__x86_64__))
+       if (kdebug_serial) {
+               new_nkdbufs = 1;
+               if (trace_typefilter == 0)
+                       trace_typefilter = 1;
+       }
        if (turn_on_log_leaks && !new_nkdbufs)
                new_nkdbufs = 200000;
        if (trace_typefilter)
index b9a7ae0ebe5a749c2d0e7902d507a0389069eadd..8b7d863b7a8e08f0ef4c867b4ce38e567009775d 100644 (file)
@@ -309,6 +309,12 @@ thread_bootstrap(void)
 #endif /* HYPERVISOR */
 
        thread_template.t_chud = 0;
+
+#if (DEVELOPMENT || DEBUG)
+       thread_template.t_page_creation_throttled_hard = 0;
+       thread_template.t_page_creation_throttled_soft = 0;
+#endif /* DEVELOPMENT || DEBUG */
+       thread_template.t_page_creation_throttled = 0;
        thread_template.t_page_creation_count = 0;
        thread_template.t_page_creation_time = 0;
 
@@ -663,7 +669,7 @@ void
 thread_terminate_enqueue(
        thread_t                thread)
 {
-       KERNEL_DEBUG_CONSTANT(TRACEDBG_CODE(DBG_TRACE_DATA, TRACE_DATA_THREAD_TERMINATE) | DBG_FUNC_NONE, thread->thread_id, 0, 0, 0, 0);
+       KERNEL_DEBUG_CONSTANT(TRACE_DATA_THREAD_TERMINATE | DBG_FUNC_NONE, thread->thread_id, 0, 0, 0, 0);
 
        simple_lock(&thread_terminate_lock);
        enqueue_tail(&thread_terminate_queue, (queue_entry_t)thread);
index a81bc6c8ddf44c2e6b13edb17aa739798edae589..0b5061a33a19a1536d4ece308967f3939c794055 100644 (file)
@@ -411,6 +411,12 @@ struct thread {
                clock_sec_t t_page_creation_time;
                uint32_t    t_page_creation_count;
 
+       uint32_t    t_page_creation_throttled;
+#if (DEVELOPMENT || DEBUG)
+       uint64_t    t_page_creation_throttled_hard;
+       uint64_t    t_page_creation_throttled_soft;
+#endif /* DEVELOPMENT || DEBUG */
+
 #define T_CHUD_MARKED           0x01          /* this thread is marked by CHUD */
 #define T_IN_CHUD               0x02          /* this thread is already in a CHUD handler */
 #define THREAD_PMC_FLAG         0x04          /* Bit in "t_chud" signifying PMC interest */    
index 7d3a98630b2b3186e37fa34c58c87979dd36b128..abd9115623dc8e2253ab069a1d59e9306c4a2c7c 100644 (file)
@@ -393,9 +393,7 @@ __END_DECLS
 #define CPUFAMILY_INTEL_SANDYBRIDGE    0x5490b78c
 #define CPUFAMILY_INTEL_IVYBRIDGE      0x1f65e835
 #define CPUFAMILY_INTEL_HASWELL                0x10b282dc
-#if !defined(XNU_HIDE_SEED)
 #define CPUFAMILY_INTEL_BROADWELL      0x582ed09c
-#endif /* not XNU_HIDE_SEED */
 #define CPUFAMILY_ARM_9                        0xe73283ae
 #define CPUFAMILY_ARM_11               0x8ff620d8
 #define CPUFAMILY_ARM_XSCALE           0x53b005f5
index 402323060ef29451c88902e2580bf64befda16a2..22485828281f6821bd7be4fcecb1fcf08ff6ccf5 100644 (file)
@@ -60,6 +60,7 @@ int           vm_compressor_mode = VM_PAGER_COMPRESSOR_WITH_SWAP;
 int            vm_scale = 16;
 
 
+int            vm_compressor_is_active = 0;
 int            vm_compression_limit = 0;
 
 extern boolean_t vm_swap_up;
@@ -464,6 +465,9 @@ vm_compressor_init(void)
                vm_compressor_swap_init();
        }
 
+       if (COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_SWAPBACKED)
+               vm_compressor_is_active = 1;
+
 #if CONFIG_FREEZE
        memorystatus_freeze_enabled = TRUE;
 #endif /* CONFIG_FREEZE */
@@ -764,9 +768,9 @@ void
 c_seg_free_locked(c_segment_t c_seg)
 {
        int             segno, i;
-       int             pages_populated;
+       int             pages_populated = 0;
        int32_t         *c_buffer = NULL;
-       uint64_t        c_swap_handle;
+       uint64_t        c_swap_handle = 0;
 
        assert(!c_seg->c_on_minorcompact_q);
 
@@ -1017,9 +1021,7 @@ struct {
 } c_seg_major_compact_stats;
 
 
-#define        C_MAJOR_COMPACTION_AGE_APPROPRIATE      30
-#define C_MAJOR_COMPACTION_OLD_ENOUGH          300
-#define C_MAJOR_COMPACTION_SIZE_APPROPRIATE    ((C_SEG_BUFSIZE * 80) / 100)
+#define C_MAJOR_COMPACTION_SIZE_APPROPRIATE    ((C_SEG_BUFSIZE * 90) / 100)
 
 
 boolean_t
@@ -2398,7 +2400,7 @@ static int
 c_compress_page(char *src, c_slot_mapping_t slot_ptr, c_segment_t *current_chead, char *scratch_buf)
 {
        int             c_size;
-       int             c_rounded_size;
+       int             c_rounded_size = 0;
        int             max_csize;
        c_slot_t        cs;
        c_segment_t     c_seg;
index e39ebf9b19bc8cc73cc0747a9a6b1d988e8f1d39..bb506ceddc0a48c4366688a158059f571d7e20e3 100644 (file)
@@ -135,11 +135,15 @@ uint64_t vm_hard_throttle_threshold;
 
 #define NEED_TO_HARD_THROTTLE_THIS_TASK()      (vm_wants_task_throttled(current_task()) ||     \
                                                 (vm_page_free_count < vm_page_throttle_limit && \
-                                                 proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO) >= THROTTLE_LEVEL_THROTTLED))
+                                                 proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO) > THROTTLE_LEVEL_THROTTLED))
 
 
-#define HARD_THROTTLE_DELAY    20000   /* 20000 us == 20 ms */
-#define SOFT_THROTTLE_DELAY    2000    /* 2000 us == 2 ms */
+#define HARD_THROTTLE_DELAY    5000    /* 5000 us == 5 ms */
+#define SOFT_THROTTLE_DELAY    200     /* 200 us == .2 ms */
+
+#define        VM_PAGE_CREATION_THROTTLE_PERIOD_SECS   6
+#define        VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC  20000
+
 
 boolean_t current_thread_aborted(void);
 
@@ -544,8 +548,13 @@ vm_fault_deactivate_behind(
 }
 
 
+#if (DEVELOPMENT || DEBUG)
+uint32_t       vm_page_creation_throttled_hard = 0;
+uint32_t       vm_page_creation_throttled_soft = 0;
+#endif /* DEVELOPMENT || DEBUG */
+
 static int
-vm_page_throttled(void)
+vm_page_throttled(boolean_t page_kept)
 {
         clock_sec_t     elapsed_sec;
         clock_sec_t     tv_sec;
@@ -556,21 +565,31 @@ vm_page_throttled(void)
        if (thread->options & TH_OPT_VMPRIV)
                return (0);
 
-       thread->t_page_creation_count++;
-
-       if (NEED_TO_HARD_THROTTLE_THIS_TASK())
+       if (thread->t_page_creation_throttled) {
+               thread->t_page_creation_throttled = 0;
+               
+               if (page_kept == FALSE)
+                       goto no_throttle;
+       }
+       if (NEED_TO_HARD_THROTTLE_THIS_TASK()) {
+#if (DEVELOPMENT || DEBUG)
+               thread->t_page_creation_throttled_hard++;
+               OSAddAtomic(1, &vm_page_creation_throttled_hard);
+#endif /* DEVELOPMENT || DEBUG */
                return (HARD_THROTTLE_DELAY);
+       }
 
        if ((vm_page_free_count < vm_page_throttle_limit || ((COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) && SWAPPER_NEEDS_TO_UNTHROTTLE())) &&
-           thread->t_page_creation_count > vm_page_creation_throttle) {
+           thread->t_page_creation_count > (VM_PAGE_CREATION_THROTTLE_PERIOD_SECS * VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC)) {
                
                clock_get_system_microtime(&tv_sec, &tv_usec);
 
                elapsed_sec = tv_sec - thread->t_page_creation_time;
 
-               if (elapsed_sec <= 6 || (thread->t_page_creation_count / elapsed_sec) >= (vm_page_creation_throttle / 6)) {
+               if (elapsed_sec <= VM_PAGE_CREATION_THROTTLE_PERIOD_SECS ||
+                   (thread->t_page_creation_count / elapsed_sec) >= VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC) {
 
-                       if (elapsed_sec >= 60) {
+                       if (elapsed_sec >= (3 * VM_PAGE_CREATION_THROTTLE_PERIOD_SECS)) {
                                /*
                                 * we'll reset our stats to give a well behaved app
                                 * that was unlucky enough to accumulate a bunch of pages
@@ -581,22 +600,35 @@ vm_page_throttled(void)
                                 * will remain in the throttled state
                                 */
                                thread->t_page_creation_time = tv_sec;
-                               thread->t_page_creation_count = (vm_page_creation_throttle / 6) * 5;
+                               thread->t_page_creation_count = VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC * (VM_PAGE_CREATION_THROTTLE_PERIOD_SECS - 1);
                        }
                        ++vm_page_throttle_count;
 
-                       if ((COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) && HARD_THROTTLE_LIMIT_REACHED())
+                       thread->t_page_creation_throttled = 1;
+
+                       if ((COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) && HARD_THROTTLE_LIMIT_REACHED()) {
+#if (DEVELOPMENT || DEBUG)
+                               thread->t_page_creation_throttled_hard++;
+                               OSAddAtomic(1, &vm_page_creation_throttled_hard);
+#endif /* DEVELOPMENT || DEBUG */
                                return (HARD_THROTTLE_DELAY);
-                       else
+                       } else {
+#if (DEVELOPMENT || DEBUG)
+                               thread->t_page_creation_throttled_soft++;
+                               OSAddAtomic(1, &vm_page_creation_throttled_soft);
+#endif /* DEVELOPMENT || DEBUG */
                                return (SOFT_THROTTLE_DELAY);
+                       }
                }
                thread->t_page_creation_time = tv_sec;
                thread->t_page_creation_count = 0;
        }
+no_throttle:
+       thread->t_page_creation_count++;
+
        return (0);
 }
 
-
 /*
  * check for various conditions that would
  * prevent us from creating a ZF page...
@@ -606,7 +638,7 @@ vm_page_throttled(void)
  * object == m->object
  */
 static vm_fault_return_t
-vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, boolean_t interruptible_state)
+vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, boolean_t interruptible_state, boolean_t page_throttle)
 {
        int throttle_delay;
 
@@ -647,7 +679,7 @@ vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, boolean_t int
                        return (VM_FAULT_RETRY);
                }
        }
-       if ((throttle_delay = vm_page_throttled())) {
+       if (page_throttle == TRUE && (throttle_delay = vm_page_throttled(FALSE))) {
                /*
                 * we're throttling zero-fills...
                 * treat this as if we couldn't grab a page
@@ -1150,7 +1182,7 @@ vm_fault_page(
                                         * fault cleanup in the case of an error condition
                                         * including resetting the thread_interrupt_level
                                         */
-                                       error = vm_fault_check(object, m, first_m, interruptible_state);
+                                       error = vm_fault_check(object, m, first_m, interruptible_state, (type_of_fault == NULL) ? TRUE : FALSE);
 
                                        if (error != VM_FAULT_SUCCESS)
                                                return (error);
@@ -1560,6 +1592,21 @@ vm_fault_page(
                                        0,
                                        &compressed_count_delta);
 
+                               if (type_of_fault == NULL) {
+                                       int     throttle_delay;
+
+                                       /*
+                                        * we weren't called from vm_fault, so we
+                                        * need to apply page creation throttling
+                                        * do it before we re-acquire any locks
+                                        */
+                                       if (my_fault_type == DBG_COMPRESSOR_FAULT) {
+                                               if ((throttle_delay = vm_page_throttled(TRUE))) {
+                                                       VM_DEBUG_EVENT(vmf_compressordelay, VMF_COMPRESSORDELAY, DBG_FUNC_NONE, throttle_delay, 0, 1, 0);
+                                                       delay(throttle_delay);
+                                               }
+                                       }
+                               }
                                vm_object_lock(object);
                                assert(object->paging_in_progress > 0);
 
@@ -1856,7 +1903,7 @@ dont_look_for_page:
                         * fault cleanup in the case of an error condition
                         * including resetting the thread_interrupt_level
                         */
-                       error = vm_fault_check(object, m, first_m, interruptible_state);
+                       error = vm_fault_check(object, m, first_m, interruptible_state, (type_of_fault == NULL) ? TRUE : FALSE);
 
                        if (error != VM_FAULT_SUCCESS)
                                return (error);
@@ -3885,31 +3932,6 @@ FastPmapEnter:
                         */
                        assert(object_lock_type == OBJECT_LOCK_EXCLUSIVE);
 
-                       if ((throttle_delay = vm_page_throttled())) {
-                               /*
-                                * drop all of our locks...
-                                * wait until the free queue is
-                                * pumped back up and then
-                                * redrive the fault
-                                */
-                               if (object != cur_object)
-                                       vm_object_unlock(cur_object);
-                               vm_object_unlock(object);
-                               vm_map_unlock_read(map);
-                               if (real_map != map)
-                                       vm_map_unlock(real_map);
-
-                               VM_DEBUG_EVENT(vmf_cowdelay, VMF_COWDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
-
-                               delay(throttle_delay);
-
-                               if (!current_thread_aborted() && vm_page_wait((change_wiring) ? 
-                                                THREAD_UNINT :
-                                                THREAD_ABORTSAFE))
-                                       goto RetryFault;
-                               kr = KERN_ABORTED;
-                               goto done;
-                       }
                         /*
                         * If objects match, then
                         * object->copy must not be NULL (else control
@@ -4268,31 +4290,6 @@ FastPmapEnter:
                                        kr = KERN_MEMORY_ERROR;
                                        goto done;
                                }
-                               if ((throttle_delay = vm_page_throttled())) {
-                                       /*
-                                        * drop all of our locks...
-                                        * wait until the free queue is
-                                        * pumped back up and then
-                                        * redrive the fault
-                                        */
-                                       if (object != cur_object)
-                                               vm_object_unlock(cur_object);
-                                       vm_object_unlock(object);
-                                       vm_map_unlock_read(map);
-                                       if (real_map != map)
-                                               vm_map_unlock(real_map);
-
-                                       VM_DEBUG_EVENT(vmf_zfdelay, VMF_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
-
-                                       delay(throttle_delay);
-
-                                       if (!current_thread_aborted() && vm_page_wait((change_wiring) ? 
-                                                        THREAD_UNINT :
-                                                        THREAD_ABORTSAFE))
-                                               goto RetryFault;
-                                       kr = KERN_ABORTED;
-                                       goto done;
-                               }
                                if (vm_backing_store_low) {
                                        /*
                                         * we are protecting the system from
@@ -4829,12 +4826,27 @@ done:
        thread_interrupt_level(interruptible_state);
 
        /*
-        * Only throttle on faults which cause a pagein.
+        * Only I/O throttle on faults which cause a pagein/swapin.
         */
        if ((type_of_fault == DBG_PAGEIND_FAULT) || (type_of_fault == DBG_PAGEINV_FAULT) || (type_of_fault == DBG_COMPRESSOR_SWAPIN_FAULT)) {
                throttle_lowpri_io(1);
-       }
+       } else {
+               if (kr == KERN_SUCCESS && type_of_fault != DBG_CACHE_HIT_FAULT && type_of_fault != DBG_GUARD_FAULT) {
 
+                       if ((throttle_delay = vm_page_throttled(TRUE))) {
+
+                               if (vm_debug_events) {
+                                       if (type_of_fault == DBG_COMPRESSOR_FAULT)
+                                               VM_DEBUG_EVENT(vmf_compressordelay, VMF_COMPRESSORDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
+                                       else if (type_of_fault == DBG_COW_FAULT)
+                                               VM_DEBUG_EVENT(vmf_cowdelay, VMF_COWDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
+                                       else
+                                               VM_DEBUG_EVENT(vmf_zfdelay, VMF_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
+                               }
+                               delay(throttle_delay);
+                       }
+               }
+       }
        KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, 
                              (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
                              ((uint64_t)vaddr >> 32),
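The reworked per-thread page-creation throttle above boils down to a rate check: a thread may create up to VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC pages per second, measured over a VM_PAGE_CREATION_THROTTLE_PERIOD_SECS window, before it is delayed, and the hard 5 ms delay is reserved for tasks already flagged for throttling or for the compressor-backed low-memory case. A condensed, self-contained restatement of that decision (locking, TH_OPT_VMPRIV, the page_kept re-entry path and the DEVELOPMENT-only counters are deliberately omitted):

/* Condensed sketch of the vm_page_throttled() logic above; not a drop-in. */
#define THROTTLE_PERIOD_SECS	6	/* VM_PAGE_CREATION_THROTTLE_PERIOD_SECS */
#define THROTTLE_RATE_PER_SEC	20000	/* VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC */
#define HARD_DELAY_US		5000	/* HARD_THROTTLE_DELAY */
#define SOFT_DELAY_US		200	/* SOFT_THROTTLE_DELAY */

struct pc_state {
	unsigned long	count;		/* pages created in the current window */
	unsigned long	window_start;	/* window start, in seconds */
};

/* Returns a delay in microseconds; 0 means "not throttled". */
static unsigned int
page_creation_delay(struct pc_state *st, unsigned long now_secs,
    int task_over_limit, int memory_is_tight, int compressor_hard_limit)
{
	if (task_over_limit)		/* NEED_TO_HARD_THROTTLE_THIS_TASK() */
		return HARD_DELAY_US;

	if (memory_is_tight &&
	    st->count > THROTTLE_PERIOD_SECS * THROTTLE_RATE_PER_SEC) {
		unsigned long elapsed = now_secs - st->window_start;

		if (elapsed <= THROTTLE_PERIOD_SECS ||
		    st->count / elapsed >= THROTTLE_RATE_PER_SEC) {
			if (elapsed >= 3 * THROTTLE_PERIOD_SECS) {
				/* long-running offender: restart the window one second
				 * short of the limit so it stays throttled unless it
				 * genuinely slows down */
				st->window_start = now_secs;
				st->count = THROTTLE_RATE_PER_SEC * (THROTTLE_PERIOD_SECS - 1);
			}
			return compressor_hard_limit ? HARD_DELAY_US : SOFT_DELAY_US;
		}
		/* creation rate fell below the limit: start a fresh window */
		st->window_start = now_secs;
		st->count = 0;
	}
	st->count++;
	return 0;
}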
index a22b763c17218f25395b275c9bdd1d3d7f1dcf6f..ca11e1bae872bddae4311b2d0dedf3719ecfb5c3 100644 (file)
@@ -6199,6 +6199,7 @@ vm_map_copy_copy(
         */
 
        new_copy = (vm_map_copy_t) zalloc(vm_map_copy_zone);
+       new_copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE;
        *new_copy = *copy;
 
        if (copy->type == VM_MAP_COPY_ENTRY_LIST) {
@@ -6847,6 +6848,7 @@ start_overwrite:
                                /* destroyed after successful copy_overwrite */
                                copy = (vm_map_copy_t) 
                                        zalloc(vm_map_copy_zone);
+                               copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE;
                                vm_map_copy_first_entry(copy) =
                                        vm_map_copy_last_entry(copy) =
                                        vm_map_copy_to_entry(copy);
@@ -7150,6 +7152,7 @@ vm_map_copy_overwrite(
                 * Extract "head_copy" out of "copy".
                 */
                head_copy = (vm_map_copy_t) zalloc(vm_map_copy_zone);
+               head_copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE;
                vm_map_copy_first_entry(head_copy) =
                        vm_map_copy_to_entry(head_copy);
                vm_map_copy_last_entry(head_copy) =
@@ -7191,6 +7194,7 @@ vm_map_copy_overwrite(
                 * Extract "tail_copy" out of "copy".
                 */
                tail_copy = (vm_map_copy_t) zalloc(vm_map_copy_zone);
+               tail_copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE;
                vm_map_copy_first_entry(tail_copy) =
                        vm_map_copy_to_entry(tail_copy);
                vm_map_copy_last_entry(tail_copy) =
@@ -8657,6 +8661,7 @@ vm_map_copyin_common(
         */
 
        copy = (vm_map_copy_t) zalloc(vm_map_copy_zone);
+       copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE;
        vm_map_copy_first_entry(copy) =
                vm_map_copy_last_entry(copy) = vm_map_copy_to_entry(copy);
        copy->type = VM_MAP_COPY_ENTRY_LIST;
@@ -9392,6 +9397,7 @@ vm_map_copy_extract(
         */
 
        copy = (vm_map_copy_t) zalloc(vm_map_copy_zone);
+       copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE;
        vm_map_copy_first_entry(copy) =
                vm_map_copy_last_entry(copy) = vm_map_copy_to_entry(copy);
        copy->type = VM_MAP_COPY_ENTRY_LIST;
@@ -9443,6 +9449,7 @@ vm_map_copyin_object(
         */
 
        copy = (vm_map_copy_t) zalloc(vm_map_copy_zone);
+       copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE;
        copy->type = VM_MAP_COPY_OBJECT;
        copy->cpy_object = object;
        copy->offset = offset;
index 3a2b381f0024ddef20e90bb89ad89c08d3719bd1..288bafba1d7c4e1818a68e369d9bec6712106dd7 100644 (file)
@@ -36,12 +36,23 @@ first_free_is_valid_store( vm_map_t map )
 }
 #endif
 
+boolean_t
+vm_map_store_has_RB_support( struct vm_map_header *hdr )
+{
+       if ((void*)hdr->rb_head_store.rbh_root == (void*)(int)SKIP_RB_TREE) {
+               return FALSE;
+       }
+       return TRUE;
+}
+
 void
 vm_map_store_init( struct vm_map_header *hdr )
 {
        vm_map_store_init_ll( hdr );
 #ifdef VM_MAP_STORE_USE_RB
-       vm_map_store_init_rb( hdr );
+       if (vm_map_store_has_RB_support( hdr )) {
+               vm_map_store_init_rb( hdr );
+       }
 #endif
 }
 
@@ -54,7 +65,12 @@ vm_map_store_lookup_entry(
 #ifdef VM_MAP_STORE_USE_LL
        return (vm_map_store_lookup_entry_ll( map, address, entry ));
 #elif defined VM_MAP_STORE_USE_RB
-       return (vm_map_store_lookup_entry_rb( map, address, entry ));
+       if (vm_map_store_has_RB_support( &map->hdr )) {
+               return (vm_map_store_lookup_entry_rb( map, address, entry ));
+       } else {
+               panic("VM map lookups need RB tree support.\n");
+               return FALSE; /* For compiler warning.*/
+       }
 #endif
 }
 
@@ -81,7 +97,9 @@ void  vm_map_store_copy_insert( vm_map_t map, vm_map_entry_t after_where, vm_map_
 {
        vm_map_store_copy_insert_ll(map, after_where, copy);
 #ifdef VM_MAP_STORE_USE_RB
-       vm_map_store_copy_insert_rb(map, after_where, copy);
+       if (vm_map_store_has_RB_support( &map->hdr )) {
+               vm_map_store_copy_insert_rb(map, after_where, copy);
+       }
 #endif
 }
 
@@ -104,7 +122,9 @@ _vm_map_store_entry_link( struct vm_map_header * mapHdr, vm_map_entry_t after_wh
        assert(entry->vme_start < entry->vme_end);
        vm_map_store_entry_link_ll(mapHdr, after_where, entry);
 #ifdef VM_MAP_STORE_USE_RB
-       vm_map_store_entry_link_rb(mapHdr, after_where, entry);
+       if (vm_map_store_has_RB_support( mapHdr )) {
+               vm_map_store_entry_link_rb(mapHdr, after_where, entry);
+       }
 #endif
 #if MAP_ENTRY_INSERTION_DEBUG
        fastbacktrace(&entry->vme_insertion_bt[0],
@@ -126,7 +146,9 @@ vm_map_store_entry_link( vm_map_t map, vm_map_entry_t after_where, vm_map_entry_
        } else {
                update_first_free_ll(VMEL_map, VMEL_map->first_free);
 #ifdef VM_MAP_STORE_USE_RB
-               update_first_free_rb(VMEL_map, VMEL_map->first_free);
+               if (vm_map_store_has_RB_support( &VMEL_map->hdr )) {
+                       update_first_free_rb(VMEL_map, VMEL_map->first_free);
+               }
 #endif
        }
 }
@@ -136,7 +158,9 @@ _vm_map_store_entry_unlink( struct vm_map_header * mapHdr, vm_map_entry_t entry)
 {
        vm_map_store_entry_unlink_ll(mapHdr, entry);
 #ifdef VM_MAP_STORE_USE_RB
-       vm_map_store_entry_unlink_rb(mapHdr, entry);
+       if (vm_map_store_has_RB_support( mapHdr )) {
+               vm_map_store_entry_unlink_rb(mapHdr, entry);
+       }
 #endif
 }
 
@@ -158,7 +182,9 @@ vm_map_store_entry_unlink( vm_map_t map, vm_map_entry_t entry)
        vm_map_store_update( map, entry, VM_MAP_ENTRY_DELETE);
        update_first_free_ll(VMEU_map, VMEU_first_free);
 #ifdef VM_MAP_STORE_USE_RB
-       update_first_free_rb(VMEU_map, VMEU_first_free);
+       if (vm_map_store_has_RB_support( &VMEU_map->hdr )) {
+               update_first_free_rb(VMEU_map, VMEU_first_free);
+       }
 #endif
 }
 
@@ -168,7 +194,9 @@ vm_map_store_copy_reset( vm_map_copy_t copy,vm_map_entry_t entry)
        int nentries = copy->cpy_hdr.nentries;
        vm_map_store_copy_reset_ll(copy, entry, nentries);
 #ifdef VM_MAP_STORE_USE_RB
-       vm_map_store_copy_reset_rb(copy, entry, nentries);
+       if (vm_map_store_has_RB_support( &copy->c_u.hdr )) {
+               vm_map_store_copy_reset_rb(copy, entry, nentries);
+       }
 #endif
 }
 
@@ -177,6 +205,8 @@ vm_map_store_update_first_free( vm_map_t map, vm_map_entry_t first_free)
 {
        update_first_free_ll(map, first_free);
 #ifdef VM_MAP_STORE_USE_RB
-       update_first_free_rb(map, first_free);
+       if (vm_map_store_has_RB_support( &map->hdr )) {
+               update_first_free_rb(map, first_free);
+       }
 #endif
 }
index dab7746ede3cf4ab1af84fa46fe85ffea6825c1c..b6c12fe19dcf261b9cb2ffe33b42a5f0be8eec6b 100644 (file)
@@ -114,6 +114,8 @@ struct vm_map_store {
        (map)->hint = (value);         \
        MACRO_END
 
+#define SKIP_RB_TREE           0xBAADC0D1
+
 #define VM_MAP_ENTRY_CREATE    1
 #define VM_MAP_ENTRY_DELETE    2
 
@@ -130,6 +132,7 @@ void        vm_map_store_copy_reset( struct vm_map_copy*, struct vm_map_entry*);
 #if MACH_ASSERT
 boolean_t first_free_is_valid_store( struct _vm_map*);
 #endif
+boolean_t vm_map_store_has_RB_support( struct vm_map_header *hdr );
 
 #endif /* _VM_VM_MAP_STORE_H */
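SKIP_RB_TREE is the sentinel that the vm_map.c hunks above plant in a freshly allocated vm_map_copy's rb_head_store.rbh_root, and vm_map_store_has_RB_support() is what lets every store operation in vm_map_store.c fall back to the linked-list path for such headers. Stripped to its essentials, with simplified names (an illustrative sketch, not the kernel structures):

#include <stdbool.h>
#include <stdlib.h>

#define SKIP_SENTINEL	((void *)(unsigned long)0xBAADC0D1)

struct map_header {
	void	*rb_root;	/* stands in for rb_head_store.rbh_root */
	/* linked-list head, entry counts, ... */
};

static bool
has_rb_support(const struct map_header *hdr)
{
	return hdr->rb_root != SKIP_SENTINEL;
}

static struct map_header *
alloc_copy_header(void)
{
	struct map_header *hdr = calloc(1, sizeof(*hdr));

	if (hdr != NULL)
		hdr->rb_root = SKIP_SENTINEL;	/* copy headers opt out of RB upkeep */
	return hdr;
}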
 
index 7d78f66b28722d43bc77aea71a2060c61b341eb4..080ffb5e732cadf5cbecaae6a6ae64e12d7dbfe2 100644 (file)
@@ -1202,7 +1202,6 @@ struct flow_control {
 uint32_t vm_pageout_considered_page = 0;
 uint32_t vm_page_filecache_min = 0;
 
-#define        VM_PAGE_FILECACHE_MIN   50000
 #define ANONS_GRABBED_LIMIT    2
 
 /*
@@ -1664,6 +1663,16 @@ return_from_scan:
                if  (cache_evict_throttle)
                        cache_evict_throttle--;
 
+               /*
+                * don't let the filecache_min fall below 33% of available memory...
+                *
+                * on systems w/o the compressor/swapper, the filecache is always
+                * a very large percentage of the AVAILABLE_NON_COMPRESSED_MEMORY
+                * since most (if not all) of the anonymous pages are in the
+                * throttled queue (which isn't counted as available) which
+                * effectively disables this filter
+                */
+               vm_page_filecache_min = (AVAILABLE_NON_COMPRESSED_MEMORY / 3);
 
                exceeded_burst_throttle = FALSE;
                /*
@@ -1961,6 +1970,15 @@ consider_inactive:
                                        page_prev_state = PAGE_STATE_INACTIVE;
                                        anons_grabbed = 0;
 
+                                       if (vm_page_pageable_external_count < vm_page_filecache_min) {
+                                               if ((++reactivated_this_call % 100))
+                                                       goto must_activate_page;
+                                               /*
+                                                * steal 1% of the file backed pages even if
+                                                * we are under the limit that has been set
+                                                * for a healthy filecache
+                                                */
+                                       }
                                        break;
                                }
                        }
@@ -2407,6 +2425,7 @@ reactivate_page:
                                        vm_page_deactivate(m);
                                        vm_pageout_inactive_deactivated++;
                                } else {
+must_activate_page:
                                        /*
                                         * The page was/is being used, so put back on active list.
                                         */
@@ -2767,7 +2786,6 @@ vm_page_free_reserve(
                vm_page_free_target = vm_page_free_min + 5;
 
        vm_page_throttle_limit = vm_page_free_target - (vm_page_free_target / 3);
-       vm_page_creation_throttle = vm_page_free_target * 3;
 }
 
 /*
@@ -3763,11 +3781,6 @@ void     vm_pageout_reinit_tuneables(void);
 void
 vm_pageout_reinit_tuneables(void)
 {
-       vm_page_filecache_min = (uint32_t) (max_mem / PAGE_SIZE) / 15;
-
-       if (vm_page_filecache_min < VM_PAGE_FILECACHE_MIN)
-               vm_page_filecache_min = VM_PAGE_FILECACHE_MIN;
-
        vm_compressor_minorcompact_threshold_divisor = 18;
        vm_compressor_majorcompact_threshold_divisor = 22;
        vm_compressor_unthrottle_threshold_divisor = 32;
@@ -3847,12 +3860,6 @@ vm_pageout(void)
        if (vm_pageout_burst_inactive_throttle == 0)
                vm_pageout_burst_inactive_throttle = VM_PAGEOUT_BURST_INACTIVE_THROTTLE;
 
-#if !CONFIG_JETSAM
-       vm_page_filecache_min = (uint32_t) (max_mem / PAGE_SIZE) / 20;
-       if (vm_page_filecache_min < VM_PAGE_FILECACHE_MIN)
-               vm_page_filecache_min = VM_PAGE_FILECACHE_MIN;
-#endif
-
        /*
         * Set kernel task to low backing store privileged 
         * status
@@ -4314,11 +4321,10 @@ upl_set_decmp_info(upl_t upl, upl_t src_upl)
         }
         src_upl->decmp_io_upl = (void *)upl;
         src_upl->ref_count++;
-       upl_unlock(src_upl);
 
         upl->flags |= UPL_DECMP_REAL_IO;
         upl->decmp_io_upl = (void *)src_upl;
-
+       upl_unlock(src_upl);
 }
 #endif /* CONFIG_IOSCHED */  
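Two of the changes above interact: vm_page_filecache_min is now recomputed on every pageout pass as one third of AVAILABLE_NON_COMPRESSED_MEMORY, and when the pageable external (file-backed) page count falls below that floor the scan reactivates 99 of every 100 candidate file pages but still evicts the 100th, per the "steal 1%" comment. The pre-increment/modulo branch is easy to misread, so here is a standalone, runnable illustration of just that counting pattern (not kernel code):

#include <stdio.h>

int main(void)
{
	unsigned int reactivated_this_call = 0;
	int reactivated = 0, stolen = 0;

	for (int i = 0; i < 1000; i++) {
		if ((++reactivated_this_call % 100)) {	/* true for 99 of every 100 pages */
			reactivated++;			/* would go back on the active queue */
			continue;
		}
		stolen++;				/* every 100th page is still reclaimed */
	}
	printf("reactivated=%d stolen=%d\n", reactivated, stolen);	/* 990 / 10 */
	return 0;
}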
 
index 513877c18947d4448c8670667d8608c6e64b8be4..9d81f507048eaf5d07f07b0a39adc7c19e42a4e1 100644 (file)
@@ -360,7 +360,6 @@ ppnum_t             max_valid_low_ppnum = 0xffffffff;
 unsigned int   vm_page_free_target = 0;
 unsigned int   vm_page_free_min = 0;
 unsigned int   vm_page_throttle_limit = 0;
-uint32_t       vm_page_creation_throttle = 0;
 unsigned int   vm_page_inactive_target = 0;
 unsigned int   vm_page_anonymous_min = 0;
 unsigned int   vm_page_inactive_min = 0;
@@ -5122,7 +5121,6 @@ hibernate_flush_queue(queue_head_t *q, int qcount)
 
                                        goto reenter_pg_on_q;
                                }
-                               vm_pageout_scan_wants_object = m_object;
 
                                vm_page_unlock_queues();
                                mutex_pause(try_failed_count++);
@@ -5132,7 +5130,6 @@ hibernate_flush_queue(queue_head_t *q, int qcount)
                                continue;
                        } else {
                                l_object = m_object;
-                               vm_pageout_scan_wants_object = VM_OBJECT_NULL;
                        }
                }
                if ( !m_object->alive || m->encrypted_cleaning || m->cleaning || m->laundry || m->busy || m->absent || m->error) {
@@ -5198,7 +5195,6 @@ hibernate_flush_queue(queue_head_t *q, int qcount)
                                vm_object_unlock(l_object);
                                l_object = NULL;
                        }
-                       vm_pageout_scan_wants_object = VM_OBJECT_NULL;
 
                        while (retval == 0) {
 
@@ -5271,7 +5267,6 @@ next_pg:
                vm_object_unlock(l_object);
                l_object = NULL;
        }
-       vm_pageout_scan_wants_object = VM_OBJECT_NULL;
 
        vm_page_unlock_queues();
 
index 6411418061d88e562eae15aeee01d98e3d2bc7a5..66a4dd7ac8e86b49671827af5c931aa60269feeb 100644 (file)
@@ -74,6 +74,60 @@ extern int _bcopystr(const void *, void *, vm_size_t, vm_size_t *);
 #define COPYINPHYS     3       /* from user virtual to kernel physical */
 #define COPYOUTPHYS    4       /* from kernel physical to user virtual */
 
+#if DEVELOPMENT
+typedef struct {
+       uint64_t        timestamp;
+       thread_t        thread;
+       uintptr_t       cr4;
+       uint8_t         cpuid;
+       uint8_t         smap_state;
+       uint8_t         copyio_active;
+} smaplog_entry_t;
+
+#define SMAPLOG_BUFFER_SIZE (50)
+static smaplog_entry_t smaplog_cbuf[SMAPLOG_BUFFER_SIZE];
+static uint32_t                smaplog_head = 0;
+
+static void
+smaplog_add_entry(boolean_t enabling)
+{
+       uint32_t index = 0;
+       thread_t thread = current_thread();
+
+       do {
+               index = smaplog_head;
+       } while (!OSCompareAndSwap(index, (index + 1) % SMAPLOG_BUFFER_SIZE, &smaplog_head));
+
+       assert(index < SMAPLOG_BUFFER_SIZE);
+       assert(smaplog_head < SMAPLOG_BUFFER_SIZE);
+       assert(thread);
+
+       smaplog_cbuf[index].timestamp = mach_absolute_time();
+       smaplog_cbuf[index].thread = thread;
+       smaplog_cbuf[index].cpuid = cpu_number();
+       smaplog_cbuf[index].cr4 = get_cr4();
+       smaplog_cbuf[index].smap_state = enabling;
+       smaplog_cbuf[index].copyio_active = (thread->machine.specFlags & CopyIOActive) ? 1 : 0;
+}
+#endif /* DEVELOPMENT */
+
+extern boolean_t pmap_smap_enabled;
+static inline void user_access_enable(void) {
+       if (pmap_smap_enabled) {
+               stac();
+#if DEVELOPMENT
+               smaplog_add_entry(TRUE);
+#endif
+       }
+}
+static inline void user_access_disable(void) {
+       if (pmap_smap_enabled) {
+               clac();
+#if DEVELOPMENT
+               smaplog_add_entry(FALSE);
+#endif
+       }
+}
 
 static int
 copyio(int copy_type, user_addr_t user_addr, char *kernel_addr,
@@ -123,6 +177,7 @@ copyio(int copy_type, user_addr_t user_addr, char *kernel_addr,
         */
        recursive_CopyIOActive = thread->machine.specFlags & CopyIOActive;
        thread->machine.specFlags |= CopyIOActive;
+       user_access_enable();
        if (no_shared_cr3) {
                istate = ml_set_interrupts_enabled(FALSE);
                if (get_cr3_base() != pmap->pm_cr3)
@@ -211,6 +266,7 @@ copyio(int copy_type, user_addr_t user_addr, char *kernel_addr,
                break;
        }
 
+       user_access_disable();
        if (!recursive_CopyIOActive) {
                thread->machine.specFlags &= ~CopyIOActive;
        }
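The DEVELOPMENT-only smaplog above claims ring-buffer slots without a lock: each logger snapshots the head index and advances it with a compare-and-swap, retrying if another CPU won the race. The same claim loop in isolation, rewritten with C11 atomics in place of libkern's OSCompareAndSwap (an illustrative sketch, not kernel code):

#include <stdatomic.h>

#define LOG_SIZE 50
static _Atomic unsigned int log_head;

/* Returns the index of a slot the caller now owns; concurrent callers get
 * distinct indices until the buffer wraps and slots are reused. */
static unsigned int
claim_slot(void)
{
	unsigned int index;

	do {
		index = atomic_load(&log_head);
	} while (!atomic_compare_exchange_weak(&log_head, &index,
	    (index + 1) % LOG_SIZE));

	return index;
}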
index 8e1b559025c4e5ef95448b0070906cd573ad0d51..51b1b3348485e964ae2c34acffa21e73e9a5787a 100644 (file)
@@ -320,6 +320,13 @@ pmap_cpu_init(void)
                        pmap_smep_enabled = TRUE;
                }
        }
+       if (cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_SMAP) {
+               boolean_t nsmap;
+               if (!PE_parse_boot_argn("-pmap_smap_disable", &nsmap, sizeof(nsmap))) {
+                       set_cr4(get_cr4() | CR4_SMAP);
+                       pmap_smap_enabled = TRUE;
+               }
+       }
 
        if (cdp->cpu_fixed_pmcs_enabled) {
                boolean_t enable = TRUE;
@@ -448,6 +455,8 @@ pmap_bootstrap(
 
        if (pmap_smep_enabled)
                printf("PMAP: Supervisor Mode Execute Protection enabled\n");
+       if (pmap_smap_enabled)
+               printf("PMAP: Supervisor Mode Access Protection enabled\n");
 
 #if    DEBUG
        printf("Stack canary: 0x%lx\n", __stack_chk_guard[0]);
index d79e4994a2b4fbbceecfa9e766865fd26ebf764b..ad6430635027355308985d39290fdd9cf27feb40 100644 (file)
@@ -175,7 +175,8 @@ typedef struct boot_args {
     uint32_t    pciConfigSpaceEndBusNumber;
     uint32_t   csrActiveConfig;
     uint32_t   csrPendingConfig;
-    uint32_t    __reserved4[728];
+    uint32_t    boot_SMC_plimit;
+    uint32_t    __reserved4[727];
 
 } boot_args;
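boot_SMC_plimit is carved out of the __reserved4 tail (728 entries down to 727), so the boot_args structure keeps its overall size, presumably so bootloader and kernel stay in agreement about the layout. A hypothetical compile-time guard (not part of the commit) that states the invariant:

#include <stdint.h>

struct tail_before { uint32_t __reserved4[728]; };
struct tail_after  { uint32_t boot_SMC_plimit; uint32_t __reserved4[727]; };

_Static_assert(sizeof(struct tail_before) == sizeof(struct tail_after),
    "carving boot_SMC_plimit out of __reserved4 must not change boot_args' size");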
 
index ad892f2aee065d935babf33ce2f6e64409d3bf34..8d1e8f460fa733a885f8f53e6965811bf3562e1a 100644 (file)
@@ -900,7 +900,7 @@ int access_chmod_fchmod_test( void * the_argp )
 
        char *          my_pathp = NULL;
 
-       uid_t           euid,ruid;
+       uid_t           ruid;
        struct stat     my_sb;
 
        FILE *          file_handle;
@@ -987,10 +987,13 @@ int access_chmod_fchmod_test( void * the_argp )
        file_handle = fopen(FILE_NOTME, "w");
        fclose(file_handle);
 
-       /* Currently running as root (through setreuid manipulation), switch to running as the current user. */
-       euid = geteuid();
+       /* Currently running as root (through settid manipulation), switch to running as the current user. */
        ruid = getuid();
-       setreuid(ruid, ruid);
+       my_err = syscall(SYS_settid, ruid, KAUTH_GID_NONE);
+       if (my_err != 0) {
+               printf("Failed to settid to non-root with error %d:%s\n", errno, strerror(errno));
+               goto test_failed_exit;
+       }
 
        /* Create a file that the current user owns  */
        file_handle = fopen(FILE_ME, "w");
@@ -1033,8 +1036,11 @@ int access_chmod_fchmod_test( void * the_argp )
        }
 
        /* Reset to running as root */
-       setreuid(ruid, euid);
-
+       my_err = syscall(SYS_settid, KAUTH_UID_NONE, KAUTH_GID_NONE);
+       if (my_err != 0) {
+               printf("Failed to revert to root using settid with error %d:%s\n", errno, strerror(errno));
+               goto test_failed_exit;
+       }
        if(error_occurred == 1) {
                goto test_failed_exit;
        }
@@ -5908,7 +5914,7 @@ int faccessat_fchmodat_fchmod_test( void * the_argp )
        char *          my_namep = NULL;
        char *          my_pathp = NULL;
 
-       uid_t           euid,ruid;
+       uid_t           ruid;
        struct stat     my_sb;
 
        FILE *          file_handle;
@@ -6044,10 +6050,13 @@ int faccessat_fchmodat_fchmod_test( void * the_argp )
        file_handle = fopen(FILE_NOTME, "w");
        fclose(file_handle);
 
-       /* Currently running as root (through setreuid manipulation), switch to running as the current user. */
-       euid = geteuid();
+       /* Currently running as root (through settid manipulation), switch to running as the current user. */
        ruid = getuid();
-       setreuid(ruid, ruid);
+       my_err = syscall(SYS_settid, ruid, KAUTH_GID_NONE);
+       if (my_err != 0) {
+               printf("Failed to settid to non-root with error %d:%s\n", errno, strerror(errno));
+               goto test_failed_exit;
+       }
 
        /* Create a file that the current user owns  */
        file_handle = fopen(FILE_ME, "w");
@@ -6090,7 +6099,11 @@ int faccessat_fchmodat_fchmod_test( void * the_argp )
        }
 
        /* Reset to running as root */
-       setreuid(ruid, euid);
+       my_err = syscall(SYS_settid, KAUTH_UID_NONE, KAUTH_GID_NONE);
+       if (my_err != 0) {
+               printf("Failed to settid revert to root with error %d:%s\n", errno, strerror(errno));
+               goto test_failed_exit;
+       }
 
        if(error_occurred == 1) {
                goto test_failed_exit;