#include <i386/cpuid.h>
 #include <i386/tsc.h>
 #include <i386/machine_routines.h>
+#include <i386/pal_routines.h>
 #include <i386/ucode.h>
 #include <kern/clock.h>
 #include <libkern/libkern.h>
 #include <i386/lapic.h>
+#include <i386/pmCPU.h>
+
 
 static int
 _i386_cpu_info SYSCTL_HANDLER_ARGS
 
 SYSCTL_NODE(_machdep, OID_AUTO, tsc, CTLFLAG_RD|CTLFLAG_LOCKED, NULL, "Timestamp counter parameters");
 
-SYSCTL_QUAD(_machdep_tsc, OID_AUTO, frequency, CTLFLAG_RD|CTLFLAG_LOCKED, &tscFreq, "");
+SYSCTL_QUAD(_machdep_tsc, OID_AUTO, frequency,
+       CTLFLAG_RD|CTLFLAG_LOCKED, &tscFreq, "");
+
+extern uint32_t deep_idle_rebase;
+SYSCTL_UINT(_machdep_tsc, OID_AUTO, deep_idle_rebase,
+       CTLFLAG_RW|CTLFLAG_KERN|CTLFLAG_LOCKED, &deep_idle_rebase, 0, "");
+
+SYSCTL_NODE(_machdep_tsc, OID_AUTO, nanotime,
+       CTLFLAG_RD|CTLFLAG_LOCKED, NULL, "TSC to ns conversion");
+SYSCTL_QUAD(_machdep_tsc_nanotime, OID_AUTO, tsc_base,
+       CTLFLAG_RD | CTLFLAG_LOCKED,
+       (uint64_t *) &pal_rtc_nanotime_info.tsc_base, "");
+SYSCTL_QUAD(_machdep_tsc_nanotime, OID_AUTO, ns_base,
+       CTLFLAG_RD | CTLFLAG_LOCKED,
+       (uint64_t *)&pal_rtc_nanotime_info.ns_base, "");
+SYSCTL_UINT(_machdep_tsc_nanotime, OID_AUTO, scale,
+       CTLFLAG_RD | CTLFLAG_LOCKED,
+       (uint32_t *)&pal_rtc_nanotime_info.scale, 0, "");
+SYSCTL_UINT(_machdep_tsc_nanotime, OID_AUTO, shift,
+       CTLFLAG_RD | CTLFLAG_LOCKED,
+       (uint32_t *)&pal_rtc_nanotime_info.shift, 0, "");
+SYSCTL_UINT(_machdep_tsc_nanotime, OID_AUTO, generation,
+       CTLFLAG_RD | CTLFLAG_LOCKED,
+       (uint32_t *)&pal_rtc_nanotime_info.generation, 0, "");
 
 SYSCTL_NODE(_machdep, OID_AUTO, misc, CTLFLAG_RW|CTLFLAG_LOCKED, 0,
        "Miscellaneous x86 kernel parameters");
 
 #include <dev/random/YarrowCoreLib/include/yarrow.h>
 
 #include <libkern/OSByteOrder.h>
+#include <libkern/OSAtomic.h>
 
 #include <mach/mach_time.h>
 #include <machine/machine_routines.h>
 
 
 /* Used to detect whether we've already been initialized */
-static int gRandomInstalled = 0;
+static UInt8 gRandomInstalled = 0;
 static PrngRef gPrngRef;
 static int gRandomError = 1;
 static lck_grp_t *gYarrowGrp;
 static lck_attr_t *gYarrowAttr;
 static lck_grp_attr_t *gYarrowGrpAttr;
 static lck_mtx_t *gYarrowMutex = 0;
+static UInt8 gYarrowInitializationLock = 0;
 
 #define RESEED_TICKS 50 /* how long a reseed operation can take */
 
 {
     prng_error_status perr;
 
+       /* Multiple threads can enter this as a result of an earlier
+        * check of gYarrowMutex.  We make sure that only one of them
+        * can enter at a time.  If one of them enters and discovers
+        * that gYarrowMutex is no longer NULL, we know that another
+        * thread has initialized the Yarrow state and we can exit.
+        */
+       
+       /* The first thread that enters this function will find
+        * gYarrowInitializationLock set to 0.  It will atomically
+        * set the value to 1 and, seeing that it was zero, drop
+        * out of the loop.  Other threads will see that the value is
+        * 1 and continue to loop until we are initialized.
+     */
+
+       while (OSTestAndSet(0, &gYarrowInitializationLock)); /* serialize access to this function */
+       
+       if (gYarrowMutex) {
+               /*  we've already been initialized, clear and get out */
+               goto function_exit;
+       }
+
     /* create a Yarrow object */
     perr = prngInitialize(&gPrngRef);
     if (perr != 0) {
     char buffer [16];
 
     /* get a little non-deterministic data as an initial seed. */
+       /* On OSX, securityd will add much more entropy as soon as it */
+       /* comes up.  On iOS, entropy is added with each system interrupt. */
     microtime(&tt);
 
     /*
     if (perr != 0) {
         /* an error, complain */
         printf ("Couldn't seed Yarrow.\n");
-        return;
+        goto function_exit;
     }
     
     /* turn the data around */
     gYarrowMutex   = lck_mtx_alloc_init(gYarrowGrp, gYarrowAttr);
        
        fips_initialize ();
+
+function_exit:
+       /* allow other threads to figure out whether or not we have been initialized. */
+       gYarrowInitializationLock = 0;
 }
 
 const Block kKnownAnswer = {0x92, 0xb4, 0x04, 0xe5, 0x56, 0x58, 0x8c, 0xed, 0x6c, 0x1a, 0xcd, 0x4e, 0xbf, 0x05, 0x3f, 0x68, 0x09, 0xf7, 0x3a, 0x93};
 {
        int ret;
 
-       if (gRandomInstalled)
+       if (OSTestAndSet(0, &gRandomInstalled)) {
+               /* do this atomically so that it works correctly with
+                multiple threads */
                return;
-
-       /* install us in the file system */
-       gRandomInstalled = 1;
-
-       /* setup yarrow and the mutex */
-       PreliminarySetup();
+       }
 
        ret = cdevsw_add(RANDOM_MAJOR, &random_cdevsw);
        if (ret < 0) {
         */
        devfs_make_node(makedev (ret, 1), DEVFS_CHAR,
                UID_ROOT, GID_WHEEL, 0666, "urandom", 0);
+
+       /* set up Yarrow and the mutex if needed */
+       PreliminarySetup();
 }
 
 int
 
                (cp->c_flag & C_DELETED) &&
                ((forkcount == 1) || (!VNODE_IS_RSRC(vp)))) {
 
-               /* Start a transaction here.  We're about to change file sizes */
-               if (started_tr == 0) {
-                       if (hfs_start_transaction(hfsmp) != 0) {
-                               error = EINVAL;
-                               goto out;
-                       }
-                       else {
-                               started_tr = 1;
-                       }
-               }
-       
                /* Truncate away our own fork data. (Case A, B, C above) */
                if (VTOF(vp)->ff_blocks != 0) {
-                       
+
+                       /* 
+                        * SYMLINKS only:
+                        *
+                        * Encapsulate the entire change (including truncating the link) in 
+                        * nested transactions if we are modifying a symlink, because we know that its
+                        * file length will be at most 4k, and we can fit both the truncation and 
+                        * any relevant bitmap changes into a single journal transaction.  We also want
+                        * the kill_block code to execute in the same transaction so that any dirty symlink
+                        * blocks will not be written. Otherwise, rely on
+                        * hfs_truncate doing its own transactions to ensure that we don't blow up
+                        * the journal.
+                        */ 
+                       if ((started_tr == 0) && (v_type == VLNK)) {
+                               if (hfs_start_transaction(hfsmp) != 0) {
+                                       error = EINVAL;
+                                       goto out;
+                               }
+                               else {
+                                       started_tr = 1;
+                               }
+                       }
+
                        /*
                         * At this point, we have decided that this cnode is
                         * suitable for full removal.  We are about to deallocate
                        if (hfsmp->jnl && vnode_islnk(vp)) {
                                buf_iterate(vp, hfs_removefile_callback, BUF_SKIP_NONLOCKED, (void *)hfsmp);
                        }
-
+       
                        /*
-                        * Since we're already inside a transaction,
-                        * tell hfs_truncate to skip the ubc_setsize.
-                        *
                         * This truncate call (and the one below) is fine from VNOP_RECLAIM's 
                         * context because we're only removing blocks, not zero-filling new 
                         * ones.  The C_DELETED check above makes things much simpler. 
                         */
-                       error = hfs_truncate(vp, (off_t)0, IO_NDELAY, 1, 0, ctx);
+                       error = hfs_truncate(vp, (off_t)0, IO_NDELAY, 0, 0, ctx);
                        if (error) {
                                goto out;
                        }
                        truncated = 1;
+
+                       /* (SYMLINKS ONLY): Close/End our transaction after truncating the file record */
+                       if (started_tr) {
+                               hfs_end_transaction(hfsmp);
+                               started_tr = 0;
+                       }
                }
                
                /* 
                 * it is the last fork.  That means, by definition, the rsrc fork is not in 
                 * core.  To avoid bringing a vnode into core for the sole purpose of deleting the
                 * data in the resource fork, we call cat_lookup directly, then hfs_release_storage
-                * to get rid of the resource fork's data. 
+                * to get rid of the resource fork's data. Note that because we are holding the 
+                * cnode lock, it is impossible for a competing thread to create the resource fork
+                * vnode from underneath us while we do this.
                 * 
                 * This is invoked via case A above only.
                 */
                         */
                        cp->c_blocks = 0;
                }
-
-               /* End the transaction from the start of the file truncation segment */
-               if (started_tr) {
-                       hfs_end_transaction(hfsmp);
-                       started_tr = 0;
-               }
        }
        
        /*
 
 /*
- * Copyright (c) 2003-2008 Apple Inc. All rights reserved.
+ * Copyright (c) 2003-2013 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
                    (SWAP_BE32 (hotfileinfo.timeleft) > 0) &&
                    (SWAP_BE32 (hotfileinfo.timebase) > 0)) {
                        hfsmp->hfc_maxfiles = SWAP_BE32 (hotfileinfo.maxfilecnt);
-                       hfsmp->hfc_timeout = SWAP_BE32 (hotfileinfo.timeleft) + tv.tv_sec ;
                        hfsmp->hfc_timebase = SWAP_BE32 (hotfileinfo.timebase);
+                       hfsmp->hfc_timeout = SWAP_BE32 (hotfileinfo.timeleft) + tv.tv_sec ;
                        /* Fix up any bogus timebase values. */
                        if (hfsmp->hfc_timebase < HFC_MIN_BASE_TIME) {
                                hfsmp->hfc_timebase = hfsmp->hfc_timeout - HFC_DEFAULT_DURATION;
        if (hfsmp->hfc_stage != HFC_RECORDING)
                return (0);
 
-       if ((!vnode_isreg(vp) && !vnode_islnk(vp)) || vnode_issystem(vp)) {
+       /* Only regular files are allowed for hotfile inclusion ; symlinks disallowed */
+       if ((!vnode_isreg(vp)) || vnode_issystem(vp)) {
                return (0);
        }
        /* Skip resource forks for now. */
        if (hfsmp->hfc_stage != HFC_RECORDING)
                return (0);
 
-       if ((!vnode_isreg(vp) && !vnode_islnk(vp)) || vnode_issystem(vp)) {
+       /* Only regular files can move out of hotfiles */
+       if ((!vnode_isreg(vp)) || vnode_issystem(vp)) {
                return (0);
        }
 
 static int
 hotfiles_collect_callback(struct vnode *vp, __unused void *cargs)
 {
-        if ((vnode_isreg(vp) || vnode_islnk(vp)) && !vnode_issystem(vp))
+        if ((vnode_isreg(vp)) && !vnode_issystem(vp))
                (void) hfs_addhotfile_internal(vp);
 
        return (VNODE_RETURNED);
                        }
                        break;
                }
-               if (!vnode_isreg(vp) && !vnode_islnk(vp)) {
+
+               /* only regular files are eligible */
+               if (!vnode_isreg(vp)) { 
                        printf("hfs: hotfiles_adopt: huh, not a file %d (%d)\n", listp->hfl_hotfile[i].hf_fileid, VTOC(vp)->c_cnid);
                        hfs_unlock(VTOC(vp));
                        vnode_put(vp);
                        }
                        break;
                }
-               if (!vnode_isreg(vp) && !vnode_islnk(vp)) {
+
+               /* only regular files are eligible */
+               if (!vnode_isreg(vp)) {
                        printf("hfs: hotfiles_evict: huh, not a file %d\n", key->fileID);
                        hfs_unlock(VTOC(vp));
                        vnode_put(vp);
 
 /*
- * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2013 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
        enum vtype vnodetype;
 
        vnodetype = vnode_vtype(vp);
-       if (vnodetype != VREG && vnodetype != VLNK) {
+       if (vnodetype != VREG) {
+               /* Note symlinks are not allowed to be relocated */
                return (EPERM);
        }
        
        if (blockHint == 0)
                blockHint = hfsmp->nextAllocation;
 
-       if ((fp->ff_size > 0x7fffffff) ||
-           ((fp->ff_size > blksize) && vnodetype == VLNK)) {
+       if ((fp->ff_size > 0x7fffffff)) {
                return (EFBIG);
        }
 
 
 }
 
 void
-start_kern_tracing(unsigned int new_nkdbufs) {
+start_kern_tracing(unsigned int new_nkdbufs, boolean_t need_map) {
 
        if (!new_nkdbufs)
                return;
        nkdbufs = kdbg_set_nkdbufs(new_nkdbufs);
        kdbg_lock_init();
        kdbg_reinit(TRUE);
+    if (need_map == TRUE)
+       kdbg_mapinit();
        kdbg_set_tracing_enabled(TRUE, KDEBUG_ENABLE_TRACE);
 
 #if defined(__i386__) || defined(__x86_64__)
 
 SYSCTL_INT (_hw, OID_AUTO, cputhreadtype, CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, &cputhreadtype, 0, "");
 
 #if defined(__i386__) || defined(__x86_64__)
-int mmx_flag = -1;
-int sse_flag = -1;
-int sse2_flag = -1;
-int sse3_flag = -1;
-int sse4_1_flag = -1;
-int sse4_2_flag = -1;
-int x86_64_flag = -1;
-int supplementalsse3_flag = -1;
-int aes_flag = -1;
-int avx1_0_flag = -1;
-int rdrand_flag = -1;
-int f16c_flag = -1;
-int enfstrg_flag = -1;
-
-SYSCTL_INT(_hw_optional, OID_AUTO, mmx, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &mmx_flag, 0, "");
-SYSCTL_INT(_hw_optional, OID_AUTO, sse, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &sse_flag, 0, "");
-SYSCTL_INT(_hw_optional, OID_AUTO, sse2, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &sse2_flag, 0, "");
-SYSCTL_INT(_hw_optional, OID_AUTO, sse3, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &sse3_flag, 0, "");
-SYSCTL_INT(_hw_optional, OID_AUTO, supplementalsse3, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &supplementalsse3_flag, 0, "");
-SYSCTL_INT(_hw_optional, OID_AUTO, sse4_1, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &sse4_1_flag, 0, "");
-SYSCTL_INT(_hw_optional, OID_AUTO, sse4_2, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &sse4_2_flag, 0, "");
+static int
+sysctl_cpu_capability
+(__unused struct sysctl_oid *oidp, void *arg1, __unused int arg2, struct sysctl_req *req)
+{      /*
+        * Common read handler for the _hw_optional SYSCTL_PROC entries that
+        * follow.  arg1 is never dereferenced: each entry passes a CPU
+        * capability bit mask (kHasMMX, kHasSSE, ...) cast to void *, and
+        * it is converted back to an integer mask here.
+        * Returns whatever SYSCTL_OUT returns.
+        */
+       uint64_t        mask = (uint64_t) (uintptr_t) arg1;
+       boolean_t       is_capable = (_get_cpu_capabilities() & mask) != 0;
+ /* is_capable is nonzero when at least one bit of mask is set in _get_cpu_capabilities() */
+       return SYSCTL_OUT(req, &is_capable, sizeof(is_capable));
+
+}
+
+SYSCTL_PROC(_hw_optional, OID_AUTO, mmx,       CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (void *) kHasMMX, 0, sysctl_cpu_capability, "I", "");
+SYSCTL_PROC(_hw_optional, OID_AUTO, sse,       CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (void *) kHasSSE, 0, sysctl_cpu_capability, "I", "");
+SYSCTL_PROC(_hw_optional, OID_AUTO, sse2,      CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (void *) kHasSSE2, 0, sysctl_cpu_capability, "I", "");
+SYSCTL_PROC(_hw_optional, OID_AUTO, sse3,      CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (void *) kHasSSE3, 0, sysctl_cpu_capability, "I", "");
+SYSCTL_PROC(_hw_optional, OID_AUTO, supplementalsse3,  CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (void *) kHasSupplementalSSE3, 0, sysctl_cpu_capability, "I", "");
+SYSCTL_PROC(_hw_optional, OID_AUTO, sse4_1,    CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (void *) kHasSSE4_1, 0, sysctl_cpu_capability, "I", "");
+SYSCTL_PROC(_hw_optional, OID_AUTO, sse4_2,    CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (void *) kHasSSE4_2, 0, sysctl_cpu_capability, "I", "");
 /* "x86_64" is actually a preprocessor symbol on the x86_64 kernel, so we have to hack this */
 #undef x86_64
-SYSCTL_INT(_hw_optional, OID_AUTO, x86_64, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &x86_64_flag, 0, "");
-SYSCTL_INT(_hw_optional, OID_AUTO, aes, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &aes_flag, 0, "");
-SYSCTL_INT(_hw_optional, OID_AUTO, avx1_0, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &avx1_0_flag, 0, "");
-SYSCTL_INT(_hw_optional, OID_AUTO, rdrand, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &rdrand_flag, 0, "");
-SYSCTL_INT(_hw_optional, OID_AUTO, f16c, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &f16c_flag, 0, "");
-SYSCTL_INT(_hw_optional, OID_AUTO, enfstrg, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &enfstrg_flag, 0, "");
+SYSCTL_PROC(_hw_optional, OID_AUTO, x86_64,    CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (void *) k64Bit, 0, sysctl_cpu_capability, "I", "");
+SYSCTL_PROC(_hw_optional, OID_AUTO, aes,       CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (void *) kHasAES, 0, sysctl_cpu_capability, "I", "");
+SYSCTL_PROC(_hw_optional, OID_AUTO, avx1_0,    CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (void *) kHasAVX1_0, 0, sysctl_cpu_capability, "I", "");
+SYSCTL_PROC(_hw_optional, OID_AUTO, rdrand,    CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (void *) kHasRDRAND, 0, sysctl_cpu_capability, "I", "");
+SYSCTL_PROC(_hw_optional, OID_AUTO, f16c,      CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (void *) kHasF16C, 0, sysctl_cpu_capability, "I", "");
+SYSCTL_PROC(_hw_optional, OID_AUTO, enfstrg,   CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (void *) kHasENFSTRG, 0, sysctl_cpu_capability, "I", "");
+SYSCTL_PROC(_hw_optional, OID_AUTO, fma,       CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (void *) kHasFMA, 0, sysctl_cpu_capability, "I", "");
+SYSCTL_PROC(_hw_optional, OID_AUTO, avx2_0,    CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (void *) kHasAVX2_0, 0, sysctl_cpu_capability, "I", "");
+SYSCTL_PROC(_hw_optional, OID_AUTO, bmi1,      CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (void *) kHasBMI1, 0, sysctl_cpu_capability, "I", "");
+SYSCTL_PROC(_hw_optional, OID_AUTO, bmi2,      CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (void *) kHasBMI2, 0, sysctl_cpu_capability, "I", "");
+SYSCTL_PROC(_hw_optional, OID_AUTO, rtm,       CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (void *) kHasRTM, 0, sysctl_cpu_capability, "I", "");
+SYSCTL_PROC(_hw_optional, OID_AUTO, hle,       CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (void *) kHasHLE, 0, sysctl_cpu_capability, "I", "");
 #else
 #error Unsupported arch
 #endif /* !__i386__ && !__x86_64 && !__arm__ */
        }
 
 #if defined (__i386__) || defined (__x86_64__)
-#define is_capability_set(k) (((_get_cpu_capabilities() & (k)) == (k)) ? 1 : 0)
-       mmx_flag                = is_capability_set(kHasMMX);
-       sse_flag                = is_capability_set(kHasSSE);
-       sse2_flag               = is_capability_set(kHasSSE2);
-       sse3_flag               = is_capability_set(kHasSSE3);
-       supplementalsse3_flag   = is_capability_set(kHasSupplementalSSE3);
-       sse4_1_flag             = is_capability_set(kHasSSE4_1);
-       sse4_2_flag             = is_capability_set(kHasSSE4_2);
-       x86_64_flag             = is_capability_set(k64Bit);
-       aes_flag                = is_capability_set(kHasAES);
-       avx1_0_flag             = is_capability_set(kHasAVX1_0);
-       rdrand_flag             = is_capability_set(kHasRDRAND);
-       f16c_flag               = is_capability_set(kHasF16C);
-       enfstrg_flag            = is_capability_set(kHasENFSTRG);
-
        /* hw.cpufamily */
        cpufamily = cpuid_cpufamily();
 
 
     int                                isssd = 0;
     uint32_t                    flags = 0;
     uint32_t                   blksize;
-    off_t                      maxiocount, count;
+    off_t                      maxiocount, count, segcount;
     boolean_t                   locked = FALSE;
 
     int (*do_ioctl)(void * p1, void * p2, u_long theIoctl, caddr_t result);
         maxiocount = count;
 
     error = do_ioctl(p1, p2, DKIOCGETMAXSEGMENTBYTECOUNTREAD, (caddr_t) &count);
+    if (!error)
+       error = do_ioctl(p1, p2, DKIOCGETMAXSEGMENTCOUNTREAD, (caddr_t) &segcount);
     if (error)
-        count = 0;
+        count = segcount = 0;
+    count *= segcount;
     if (count && (count < maxiocount))
         maxiocount = count;
 
     error = do_ioctl(p1, p2, DKIOCGETMAXSEGMENTBYTECOUNTWRITE, (caddr_t) &count);
+    if (!error)
+       error = do_ioctl(p1, p2, DKIOCGETMAXSEGMENTCOUNTWRITE, (caddr_t) &segcount);
     if (error)
-        count = 0;
+        count = segcount = 0;
+    count *= segcount;
     if (count && (count < maxiocount))
         maxiocount = count;
 
 
 0x1400058      MACH_SCHED_REDISPATCH
 0x140005C      MACH_SCHED_REMOTE_AST
 0x1400060      MACH_SCHED_LPA_BROKEN
+0x1400064      MACH_DEEP_IDLE
 0x1500000      MACH_MSGID_INVALID
 0x1600000      MTX_SLEEP
 0x1600004      MTX_SLEEP_DEADLINE
 0x1700020      PMAP_flush_TLBS
 0x1700024      PMAP_update_interrupt
 0x1700028      PMAP_attribute_clear
+0x1900000      MP_TLB_FLUSH
+0x1900004      MP_CPUS_CALL
+0x1900008      MP_CPUS_CALL_LOCAL
+0x190000c      MP_CPUS_CALL_ACTION
+0x1900010      MP_CPUS_CALL_NOBUF
+0x1900014      MP_CPU_FAST_START
+0x1900018      MP_CPU_START
+0x190001c      MP_CPU_DEACTIVATE
 0x2010000      L_IP_In_Beg
 0x2010004      L_IP_Out_Beg
 0x2010008      L_IP_In_End
 0x53101a4      CPUPM_TEST_RUN_INFO
 0x53101a8      CPUPM_TEST_SLAVE_INFO
 0x53101ac      CPUPM_FORCED_IDLE
+0x53101b4      CPUPM_PSTATE_CHOOSE
+0x53101b8      CPUPM_PSTATE_COMMIT
+0x53101bc      CPUPM_PSTATE_CHECK
+0x531023C      CPUPM_TQM       
+0x5310240      CPUPM_QUIESCE
+0x5310244      CPUPM_MBD
+0x5310248      CPUPM_PST_RATELIMIT_QOS
+0x531024C      CPUPM_PST_QOS_RATEUNLIMIT
+0x5310250      CPUPM_PST_QOS_SWITCH
+0x5310254      CPUPM_FORCED_IDLE
+0x531023C      CPUPM_TQM       
+0x5310240      CPUPM_QUIESCE
+0x5310244      CPUPM_MBD
+0x5310248      CPUPM_PST_RATELIMIT_QOS
+0x531024C      CPUPM_PST_QOS_RATEUNLIMIT
+0x5310250      CPUPM_PST_QOS_SWITCH
+0x5310254      CPUPM_FORCED_IDLE
+0x5320000      CPUPM_PST_RESOLVE
+0x5320004      CPUPM_PST_LOAD_TXFR
+0x5320008      CPUPM_PST_IDLE_EXIT
+0x532000C      CPUPM_PST_IDLE_ENTRY
+0x5320010      CPUPM_PST_TIMER
+0x5320014      CPUPM_PST_MAXBUS
+0x5320018      CPUPM_PST_MAXINT
+0x532001C      CPUPM_PST_PLIMIT
+0x5320020      CPUPM_PST_SELFSEL
+0x5320024      CPUPM_PST_RATELIMIT
+0x5320028      CPUPM_PST_RATEUNLIMIT
+0x532002C      CPUPM_DVFS_PAUSE
+0x5320030      CPUPM_DVFS_RESUME
+0x5320034      CPUPM_DVFS_ADVANCE
+0x5320038      CPUPM_DVFS_TRANSIT
 0x5330000      HIBERNATE
 0x5330004      HIBERNATE_WRITE_IMAGE
 0x5330008      HIBERNATE_MACHINE_INIT
 
 {
 #pragma unused(err)
        struct sfb_bin_fcentry *fce;
-       struct inp_fc_entry *infc;
+       struct inpcb *inp;
 
        for (;;) {
                lck_mtx_assert(&ifnet_fclist_lock, LCK_MTX_ASSERT_OWNED);
                SLIST_NEXT(fce, fce_link) = NULL;
                lck_mtx_unlock(&ifnet_fclist_lock);
 
-               infc = inp_fc_getinp(fce->fce_flowhash);
-               if (infc == NULL) {
+               inp = inp_fc_getinp(fce->fce_flowhash, 0);
+               if (inp == NULL) {
                        ifnet_fce_free(fce);
                        lck_mtx_lock_spin(&ifnet_fclist_lock);
                        continue;
                }
-               VERIFY(infc->infc_inp != NULL);
+               inp_fc_feedback(inp);
 
-               inp_fc_feedback(infc->infc_inp);
-
-               inp_fc_entry_free(infc);
                ifnet_fce_free(fce);
                lck_mtx_lock_spin(&ifnet_fclist_lock);
        }
 
                OIGMPSTAT_INC(igps_rcv_tooshort);
                return;
        }
-       VERIFY(IS_P2ALIGNED(igmp, sizeof (u_int32_t)));
+       /* N.B.: we assume the packet was correctly aligned in ip_input. */
 
        /*
         * Validate checksum.
                                        OIGMPSTAT_INC(igps_rcv_tooshort);
                                        return;
                                }
-                               VERIFY(IS_P2ALIGNED(igmpv3,
-                                   sizeof (u_int32_t)));
+                               /* 
+                                * N.B.: we assume the packet was correctly
+                                * aligned in ip_input.
+                                */
                                if (igmp_input_v3_query(ifp, ip, igmpv3) != 0) {
                                        m_freem(m);
                                        return;
 
 
 u_int32_t inp_hash_seed = 0;
 
-static __inline int infc_cmp(const struct inp_fc_entry *,
-    const struct inp_fc_entry *);
+static __inline int infc_cmp(const struct inpcb *,
+    const struct inpcb *);
 lck_grp_t *inp_lck_grp;
 lck_grp_attr_t *inp_lck_grp_attr;
 lck_attr_t *inp_lck_attr;
 decl_lck_mtx_data(, inp_fc_lck);
 
-RB_HEAD(inp_fc_tree, inp_fc_entry) inp_fc_tree;
-RB_PROTOTYPE(inp_fc_tree, inp_fc_entry, infc_link, infc_cmp);
+RB_HEAD(inp_fc_tree, inpcb) inp_fc_tree;
+RB_PROTOTYPE(inp_fc_tree, inpcb, infc_link, infc_cmp);
+RB_GENERATE(inp_fc_tree, inpcb, infc_link, infc_cmp);
 
-RB_GENERATE(inp_fc_tree, inp_fc_entry, infc_link, infc_cmp);
-
-static unsigned int inp_fcezone_size;
-static struct zone *inp_fcezone;
-#define INP_FCEZONE_NAME "inp_fcezone"
-#define INP_FCEZONE_MAX 32
+/*
+ * Use this inp as a key to find an inp in the flowhash tree.
+ * Accesses to it are protected by inp_fc_lck.
+ */
+struct inpcb key_inp;
 
 /*
  * in_pcb.c: manage the Protocol Control Blocks.
        inp_lck_attr = lck_attr_alloc_init();
        lck_mtx_init(&inp_fc_lck, inp_lck_grp, inp_lck_attr);
 
+       lck_mtx_lock(&inp_fc_lck);
        RB_INIT(&inp_fc_tree);
-
-       inp_fcezone_size = P2ROUNDUP(sizeof (struct inp_fc_entry),
-           sizeof (u_int64_t));
-       inp_fcezone = zinit(inp_fcezone_size,
-           INP_FCEZONE_MAX * inp_fcezone_size, 0, INP_FCEZONE_NAME);
-       if (inp_fcezone == NULL) {
-               panic("%s: failed allocating %s", __func__,
-                   INP_FCEZONE_NAME);
-               /* NOTREACHED */
-       }
-       zone_change(inp_fcezone, Z_EXPAND, TRUE);
-       zone_change(inp_fcezone, Z_CALLERACCT, FALSE);
+       bzero(&key_inp, sizeof(key_inp));
+       lck_mtx_unlock(&inp_fc_lck);
 }
 
 /*
 void
 in_pcbremlists(struct inpcb *inp)
 {
-       struct inp_fc_entry *infce;
        inp->inp_gencnt = ++inp->inp_pcbinfo->ipi_gencnt;
 
        if (inp->inp_lport) {
        }
        LIST_REMOVE(inp, inp_list);
 
-       infce = inp_fc_getinp(inp->inp_flowhash);
-       if (infce != NULL)
-               inp_fc_entry_free(infce);
-
+       if (inp->inp_flags2 & INP2_IN_FCTREE) {
+               inp_fc_getinp(inp->inp_flowhash,
+                       (INPFC_SOLOCKED|INPFC_REMOVE));
+               VERIFY(!(inp->inp_flags2 & INP2_IN_FCTREE));
+       }
        inp->inp_pcbinfo->ipi_count--;
 }
 
 {
        struct inp_flowhash_key fh __attribute__((aligned(8)));
        u_int32_t flowhash = 0;
+       struct inpcb *tmp_inp = NULL;
 
        if (inp_hash_seed == 0)
                inp_hash_seed = RandomULong();
                goto try_again;
        }
 
-       return flowhash;
-}
+       inp->inp_flowhash = flowhash;
 
-/*
- * Function to compare inp_fc_entries in inp flow control tree
- */
-static inline int
-infc_cmp(const struct inp_fc_entry *fc1, const struct inp_fc_entry *fc2)
-{
-       return (fc1->infc_flowhash - fc2->infc_flowhash);
-}
-
-int
-inp_fc_addinp(struct inpcb *inp)
-{
-       struct inp_fc_entry keyfc, *infc;
-       u_int32_t flowhash = inp->inp_flowhash;
-
-       keyfc.infc_flowhash = flowhash;
-
-       lck_mtx_lock_spin(&inp_fc_lck);
-       infc = RB_FIND(inp_fc_tree, &inp_fc_tree, &keyfc);
-       if (infc != NULL && infc->infc_inp == inp) {
-               /* Entry is already in inp_fc_tree, return */
-               lck_mtx_unlock(&inp_fc_lck);
-               return (1);
-       }
+       /* Insert the inp into inp_fc_tree */
 
-       if (infc != NULL) {
+       lck_mtx_lock(&inp_fc_lck);
+       tmp_inp = RB_FIND(inp_fc_tree, &inp_fc_tree, inp);
+       if (tmp_inp != NULL) {
                /*
-                * There is a different fc entry with the same
-                * flow hash but different inp pointer. There
-                * can be a collision on flow hash but the
-                * probability is low. Let's just avoid
-                * adding a second one when there is a collision
+                * There is a different inp with the same flowhash.
+                * There can be a collision on flow hash but the
+                * probability is low. Let's recompute the
+                * flowhash.
                 */
                lck_mtx_unlock(&inp_fc_lck);
-               return (0);
-       }
-
-       /* become regular mutex */
-       lck_mtx_convert_spin(&inp_fc_lck);
-
-       infc = zalloc_noblock(inp_fcezone);
-       if (infc == NULL) {
-               /* memory allocation failed */
-               lck_mtx_unlock(&inp_fc_lck);
-               return (0);
+               /* recompute hash seed */
+               inp_hash_seed = RandomULong();
+               goto try_again;
        }
-       bzero(infc, sizeof (*infc));
-
-       infc->infc_flowhash = flowhash;
-       infc->infc_inp = inp;
-
-       RB_INSERT(inp_fc_tree, &inp_fc_tree, infc);
+       RB_INSERT(inp_fc_tree, &inp_fc_tree, inp);
+       inp->inp_flags2 |= INP2_IN_FCTREE;
        lck_mtx_unlock(&inp_fc_lck);
-       return (1);
+
+       return flowhash;
 }
 
-struct inp_fc_entry*
-inp_fc_getinp(u_int32_t flowhash)
+/*
+ * Comparison function for the inp flow control tree (inp_fc_tree).
+ * Orders two inpcbs by the raw bytes of their inp_flowhash fields and
+ * returns memcmp()'s result: zero when the two flow hashes are equal,
+ * nonzero otherwise.  The ordering is byte-wise, so it may differ from
+ * the numeric order of the hash values.
+ */
+static inline int
+infc_cmp(const struct inpcb *inp1, const struct inpcb *inp2)
 {
-       struct inp_fc_entry keyfc, *infc;
+       return (memcmp(&(inp1->inp_flowhash), &(inp2->inp_flowhash),
+               sizeof(inp1->inp_flowhash)));
+}
 
-       keyfc.infc_flowhash = flowhash;
+struct inpcb *
+inp_fc_getinp(u_int32_t flowhash, u_int32_t flags)
+{      /*
+        * Look up the inpcb whose inp_flowhash equals flowhash in
+        * inp_fc_tree.
+        *
+        * flags:
+        *   INPFC_SOLOCKED - forwarded to in_pcb_checkstate() as its third
+        *                    argument.
+        *   INPFC_REMOVE   - unlink the matching inpcb from the tree, zero
+        *                    its infc_link and clear INP2_IN_FCTREE.
+        *
+        * Returns NULL when no entry matches, always NULL on the
+        * INPFC_REMOVE path, and NULL when in_pcb_checkstate() reports
+        * WNT_STOPUSING.  Otherwise returns the inpcb after a successful
+        * WNT_ACQUIRE call; presumably the caller must balance that with
+        * WNT_RELEASE -- confirm against in_pcb_checkstate().
+        */
+       struct inpcb *inp = NULL;
+       int locked = (flags & INPFC_SOLOCKED) ? 1 : 0;
 
        lck_mtx_lock_spin(&inp_fc_lck);
-       infc = RB_FIND(inp_fc_tree, &inp_fc_tree, &keyfc);
-       if (infc == NULL) {
+       key_inp.inp_flowhash = flowhash;        /* shared global key; written only while inp_fc_lck is held */
+       inp = RB_FIND(inp_fc_tree, &inp_fc_tree, &key_inp);
+       if (inp == NULL) {
                /* inp is not present, return */
                lck_mtx_unlock(&inp_fc_lck);
                return (NULL);
        }
 
-       RB_REMOVE(inp_fc_tree, &inp_fc_tree, infc);
-
-       if (in_pcb_checkstate(infc->infc_inp, WNT_ACQUIRE, 0) ==
-           WNT_STOPUSING) {
-               /* become regular mutex */
-               lck_mtx_convert_spin(&inp_fc_lck);
+       if (flags & INPFC_REMOVE) {     /* removal request: nothing is handed back to the caller */
+               RB_REMOVE(inp_fc_tree, &inp_fc_tree, inp);
+               lck_mtx_unlock(&inp_fc_lck);
 
-               /*
-                * This inp is going away, just don't process it.
-                */
-               inp_fc_entry_free(infc);
-               infc = NULL;
+               bzero(&(inp->infc_link), sizeof (inp->infc_link));      /* done after inp_fc_lck is dropped */
+               inp->inp_flags2 &= ~INP2_IN_FCTREE;
+               return (NULL);
        }
+       if (in_pcb_checkstate(inp, WNT_ACQUIRE, locked) == WNT_STOPUSING)       /* still under inp_fc_lck */
+               inp = NULL;
        lck_mtx_unlock(&inp_fc_lck);
 
-       return (infc);
-}
-
-void
-inp_fc_entry_free(struct inp_fc_entry *infc)
-{
-       zfree(inp_fcezone, infc);
+       return (inp);
 }
 
 void
 int
 inp_set_fc_state(struct inpcb *inp, int advcode)
 {
+       struct inpcb *tmp_inp = NULL;
        /*
         * If there was a feedback from the interface when 
         * send operation was in progress, we should ignore
                return(0);
 
        inp->inp_flags &= ~(INP_FLOW_CONTROLLED | INP_FLOW_SUSPENDED);
-       if (inp_fc_addinp(inp)) {
+       if ((tmp_inp = inp_fc_getinp(inp->inp_flowhash, INPFC_SOLOCKED)) 
+               != NULL) {
+               if (in_pcb_checkstate(tmp_inp, WNT_RELEASE, 1)
+                       == WNT_STOPUSING)
+                       return (0);
+               VERIFY(tmp_inp == inp);
                switch (advcode) {
                case FADV_FLOW_CONTROLLED:
                        inp->inp_flags |= INP_FLOW_CONTROLLED;
                        inp->inp_socket->so_flags |= SOF_SUSPENDED;
                        break;
                }
+               return (1);
        }
-       return(1);
+       return(0);
 }
 
 /*
 
 #endif
 struct ifnet;
 
-#ifdef BSD_KERNEL_PRIVATE
-/* Flow control entry per socket */
-struct inp_fc_entry {
-       RB_ENTRY(inp_fc_entry) infc_link;
-       u_int32_t infc_flowhash;
-       struct inpcb *infc_inp;
-};
-#endif /* BSD_KERNEL_PRIVATE */
-
 struct inp_stat {
        u_int64_t       rxpackets;
        u_int64_t       rxbytes;
        struct  socket *inp_socket;     /* back pointer to socket */
        u_int32_t nat_cookie;           /* Cookie stored and returned to NAT */
        LIST_ENTRY(inpcb) inp_portlist; /* list for this PCB's local port */
+       RB_ENTRY(inpcb) infc_link;      /* link for flowhash RB tree */
        struct  inpcbport *inp_phd;     /* head of this list */
        inp_gen_t inp_gencnt;           /* generation count of this instance */
        u_int32_t inp_flags;            /* generic IP/datagram flags */
+       u_int32_t inp_flags2;           /* generic IP/datagram flags #2 */
        u_int32_t inp_flow;
 
        u_char  inp_sndinprog_cnt;      /* outstanding send operations */
 #define        IN6P_RECV_ANYIF         INP_RECV_ANYIF
 #define        IN6P_CONTROLOPTS INP_CONTROLOPTS
 #define        IN6P_NO_IFT_CELLULAR    INP_NO_IFT_CELLULAR
+
+/* Overflowed INP flags; use INP2 prefix to avoid misuse */
+#define INP2_IN_FCTREE         0x2     /* in inp_fc_tree */
        /*
         * socket AF version is {newer than,or include}
         * actual datagram AF version
 extern int     inp_nocellular(struct inpcb *, unsigned int);
 extern u_int32_t inp_calc_flowhash(struct inpcb *);
 extern void    socket_flowadv_init(void);
-extern int     inp_fc_addinp(struct inpcb *);
-extern struct inp_fc_entry *inp_fc_getinp(u_int32_t);
-extern void    inp_fc_entry_free(struct inp_fc_entry *);
+
+/* Flags used by inp_fc_getinp */
+#define INPFC_SOLOCKED 0x1
+#define INPFC_REMOVE   0x2
+extern struct inpcb *inp_fc_getinp(u_int32_t, u_int32_t);
 extern void    inp_fc_feedback(struct inpcb *);
 extern void    inp_reset_fc_state(struct inpcb *);
 extern int     inp_set_fc_state(struct inpcb *, int advcode);
 
 #define CP_READ_ACCESS         0x1
 #define CP_WRITE_ACCESS 0x2
 
+/* 
+ * Check for this version when deciding to enable features
+ */
 #define CONTENT_PROTECTION_XATTR_NAME  "com.apple.system.cprotect"
 #define CP_NEW_MAJOR_VERS 4
 #define CP_PREV_MAJOR_VERS 2
 
 #define        MACH_REMOTE_AST         0x17    /* AST signal issued to remote processor */
 
 #define        MACH_SCHED_LPA_BROKEN   0x18    /* last_processor affinity broken in choose_processor */
+#define MACH_DEEP_IDLE          0x19   /* deep idle on master processor */
 
 /* Codes for pmap (DBG_MACH_PMAP) */     
 #define PMAP__CREATE           0x0
 #define DBG_DRVSD              19      /* Secure Digital */
 #define DBG_DRVNAND            20      /* NAND drivers and layers */
 #define DBG_SSD                        21      /* SSD */
+#define DBG_DRVSPI             22      /* SPI */
 
 /* Backwards compatibility */
 #define        DBG_DRVPOINTING         DBG_DRVHID              /* OBSOLETE: Use DBG_DRVHID instead */
 extern void kdbg_trace_string(struct proc *proc, long *arg1, long *arg2, long *arg3, long *arg4);
 
 extern void kdbg_dump_trace_to_file(const char *);
-void start_kern_tracing(unsigned int);
+void start_kern_tracing(unsigned int, boolean_t);
 struct task;
 extern void kdbg_get_task_name(char*, int, struct task *task);
 void disable_wrap(uint32_t *old_slowcheck, uint32_t *old_flags);
 
-12.4.0
+12.5.0
 
 # The first line of this file contains the master version number for the kernel.
 # All other instances of the kernel version in xnu are derived from this file.
 
 __ZN22IOInterruptEventSource7warmCPUEy
 _acpi_install_wake_handler
 _acpi_sleep_kernel
+_acpi_idle_kernel
 _add_fsevent
 _apic_table
 _apply_func_phys
 
     uint32_t   sleepTime;
     uint32_t    compression;
 
-    uint32_t   reserved[68];           // make sizeof == 512
+    uint32_t   reserved[62];           // make sizeof == 512
+
+    uint64_t   restoreTime1 __attribute__ ((packed));
+    uint64_t   restoreTime2 __attribute__ ((packed));
+    uint64_t   restoreTime3 __attribute__ ((packed));
 
     uint64_t   encryptEnd __attribute__ ((packed));
     uint64_t   deviceBase __attribute__ ((packed));
 
 #include <libkern/c++/OSObject.h>
 
 #define kIOPolledInterfaceSupportKey "IOPolledInterface"
+#define kIOPolledInterfaceActiveKey  "IOPolledInterfaceActive"
 
 enum
 {
 
     void ParentChangeNotifyInterestedDriversDidChange ( void );
     void ParentChangeTellCapabilityDidChange ( void );
     void ParentChangeAcknowledgePowerChange ( void );
-    void ParentChangeCancelIdleTimer( IOPMPowerStateIndex );
+    void ParentChangeRootChangeDown( void );
 
     void all_done ( void );
     void start_ack_timer ( void );
     void startSettleTimer( void );
     bool checkForDone ( void );
     bool responseValid ( uint32_t x, int pid );
-    void computeDesiredState ( unsigned long tempDesire = 0 );
+    void computeDesiredState( unsigned long tempDesire, bool computeOnly );
     void trackSystemSleepPreventers( IOPMPowerStateIndex, IOPMPowerStateIndex, IOPMPowerChangeFlags );
     void tellSystemCapabilityChange( uint32_t nextMS );
+    void restartIdleTimer( void );
 
        static void ack_timer_expired( thread_call_param_t, thread_call_param_t );
        static IOReturn actionAckTimerExpired(OSObject *, void *, void *, void *, void * );
 
 
     @constant kIOPMInitialDeviceState
     Indicates the initial power state for the device. If <code>initialPowerStateForDomainState()</code> returns a power state with this flag set in the capability field, then the initial power change is performed without calling the driver's <code>setPowerState()</code>.
+
+    @constant kIOPMRootDomainState
+    An indication that the power flags represent the state of the root power
+    domain. This bit must not be set in the IOPMPowerState structure.
+    Power Management may pass this bit to initialPowerStateForDomainState()
+    or powerStateForDomainState() to map from a global system state to the
+    desired device state.
 */
 typedef unsigned long IOPMPowerFlags;
 enum {
     kIOPMRestartCapability          = 0x00000080,
     kIOPMSleep                      = 0x00000001,
     kIOPMRestart                    = 0x00000080,
-    kIOPMInitialDeviceState         = 0x00000100
+    kIOPMInitialDeviceState         = 0x00000100,
+    kIOPMRootDomainState            = 0x00000200
 };
 
 /*
  */
 #define kIOPMDestroyFVKeyOnStandbyKey       "DestroyFVKeyOnStandby"
 
+/*******************************************************************************
+ *
+ * Properties that can control power management behavior
+ *
+ ******************************************************************************/
+
+/* kIOPMResetPowerStateOnWakeKey
+ * If an IOService publishes this key with the value of kOSBooleanTrue,
+ * then PM will disregard the influence from changePowerStateToPriv() or
+ * any activity tickles that occurred before system sleep when resolving
+ * the initial device power state on wake. Influences from power children
+ * and changePowerStateTo() are not eliminated. At the earliest opportunity
+ * upon system wake, PM will query the driver for a new power state to be
+ * installed as the initial changePowerStateToPriv() influence, by calling
+ * initialPowerStateForDomainState() with both kIOPMRootDomainState and
+ * kIOPMPowerOn flags set. The default implementation will always return
+ * the lowest power state. Drivers can override this default behavior to
+ * immediately raise the power state when there is work blocked on the
+ * power change, and cannot afford to wait until the next activity tickle.
+ * This property should be statically added to a driver's plist or set at
+ * runtime before calling PMinit().
+ */
+#define kIOPMResetPowerStateOnWakeKey       "IOPMResetPowerStateOnWake"
+
 /*******************************************************************************
  *
  * Driver PM Assertions
 
  */
 #define kIOPMUserWakeAlarmScheduledKey      "UserWakeAlarmScheduled"
 
+/* kIOPMDeepIdleSupportedKey
+ * Presence of this key indicates Deep Idle is supported on this platform.
+ * Key will always refer to a value of kOSBooleanTrue.
+ */
+#define kIOPMDeepIdleSupportedKey           "IOPMDeepIdleSupported"
+
 /*****************************************************************************
  *
  * System Sleep Policy
     kIOPMSleepTypeHibernate                 = 4,
     kIOPMSleepTypeStandby                   = 5,
     kIOPMSleepTypePowerOff                  = 6,
-    kIOPMSleepTypeLast                      = 7
+    kIOPMSleepTypeDeepIdle                  = 7,
+    kIOPMSleepTypeLast                      = 8
 };
 
 // System Sleep Flags
 
        if (kIOReturnSuccess != err)
            break;
 
+       vars->media = part;
+        next = part;
+       while (next)
+       {
+           next->setProperty(kIOPolledInterfaceActiveKey, kOSBooleanTrue);
+           next = next->getParentEntry(gIOServicePlane);
+       }
+
        *fileVars    = vars;
        *fileExtents = extentsData;
     
 static IOReturn
 IOHibernateDone(IOHibernateVars * vars)
 {
+    IORegistryEntry * next;
+
     hibernate_teardown(vars->page_list, vars->page_list_wired, vars->page_list_pal);
 
     if (vars->videoMapping)
         IOService::getPMRootDomain()->removeProperty(kIOHibernateGfxStatusKey);
     }
 
-
     if (vars->fileVars)
     {
+       if ((next = vars->fileVars->media)) do
+       {
+           next->removeProperty(kIOPolledInterfaceActiveKey);
+           next = next->getParentEntry(gIOServicePlane);
+       }
+       while (next);
        IOPolledFileClose(vars->fileVars);
     }
 
            gIOHibernateCurrentHeader->diag[0], gIOHibernateCurrentHeader->diag[1], 
            gIOHibernateCurrentHeader->diag[2], gIOHibernateCurrentHeader->diag[3]);
 
+    HIBLOG("restore times %qd, %qd, %qd ms, tsc 0x%qx scale 0x%x\n", 
+       (((gIOHibernateCurrentHeader->restoreTime1 * pal_rtc_nanotime_info.scale) >> 32) / 1000000),
+       (((gIOHibernateCurrentHeader->restoreTime2 * pal_rtc_nanotime_info.scale) >> 32) / 1000000),
+       (((gIOHibernateCurrentHeader->restoreTime3 * pal_rtc_nanotime_info.scale) >> 32) / 1000000),
+       gIOHibernateCurrentHeader->restoreTime1, pal_rtc_nanotime_info.scale);
+
     if ((kIOHibernateModeDiscardCleanActive | kIOHibernateModeDiscardCleanInactive) & gIOHibernateMode)
         hibernate_page_list_discard(vars->page_list);
 
                break;
 
            case kIOHibernateHandoffTypeMemoryMap:
+
+               clock_get_uptime(&allTime);
+
                hibernate_newruntime_map(data, handoff->bytecount, 
                                         gIOHibernateCurrentHeader->systemTableOffset);
+
+               clock_get_uptime(&endTime);
+           
+               SUB_ABSOLUTETIME(&endTime, &allTime);
+               absolutetime_to_nanoseconds(endTime, &nsec);
+           
+               HIBLOG("hibernate_newruntime_map time: %qd ms, ", nsec / 1000000ULL);
+
                break;
 
            case kIOHibernateHandoffTypeDeviceTree:
 
 struct IOPolledFileIOVars
 {
     struct kern_direct_file_io_ref_t * fileRef;
+    IORegistryEntry *                   media;
     class OSArray *                    pollers;
     IOByteCount                                blockSize;
     uint8_t *                                  buffer;
 
 #include <libkern/WKdm.h>
 #include "IOHibernateInternal.h"
 
-#if defined(__i386__) || defined(__x86_64__)
-#include <i386/pal_hibernate.h>
-#endif
+#include <machine/pal_hibernate.h>
 
 /*
 This code is linked into the kernel but part of the "__HIB" section, which means
 
 #if defined(__i386__) || defined(__x86_64__)
 
+#define rdtsc(lo,hi) \
+    __asm__ volatile("lfence; rdtsc; lfence" : "=a" (lo), "=d" (hi)) /* rdtsc bracketed by lfence; "=a" output goes to lo, "=d" output goes to hi */
+
+static inline uint64_t rdtsc64(void) /*
+ * Read the timestamp counter through the rdtsc() macro and return it as a
+ * single 64-bit value: hi shifted into the upper 32 bits, OR-ed with lo.
+ */
+{
+    uint64_t lo, hi; /* NOTE(review): 64-bit variables bound to the "=a"/"=d" constraints - confirm this builds as intended for __i386__ */
+    rdtsc(lo, hi);
+    return ((hi) << 32) | (lo);
+}
+
+#else
+
+static inline uint64_t rdtsc64(void) /* fallback when neither __i386__ nor __x86_64__ is defined: always reports 0 */
+{
+    return (0);
+}
+
+#endif /* defined(__i386__) || defined(__x86_64__) */
+
+#if defined(__i386__) || defined(__x86_64__)
+
 #define DBGLOG 1
 
 #include <architecture/i386/pio.h>
     uint32_t handoffPages;
     uint32_t handoffPageCount;
 
+    uint64_t timeStart, time;
+    timeStart = rdtsc64();
+
     C_ASSERT(sizeof(IOHibernateImageHeader) == 512);
 
     headerPhys = ptoa_64(p1);
            if (!conflicts)
            {
 //              if (compressedSize)
+               time = rdtsc64();
                pageSum = store_one_page(gIOHibernateCurrentHeader->processorFlags,
                                         src, compressedSize, 0, ppnum);
+                gIOHibernateCurrentHeader->restoreTime2 += (rdtsc64() - time);
                if (stage != 2)
                    sum += pageSum;
                uncompressedPages++;
 
     // -- copy back conflicts
 
+    time = rdtsc64();
+
     pageListPage = copyPageListHeadPage;
     while (pageListPage)
     {
 
     pal_hib_patchup();
 
+    gIOHibernateCurrentHeader->restoreTime3 = (rdtsc64() - time);
+
     // -- image has been destroyed...
 
     gIOHibernateCurrentHeader->actualImage1Sum         = sum;
 
     gIOHibernateState = kIOHibernateStateWakingFromHibernate;
 
+    gIOHibernateCurrentHeader->restoreTime1 = (rdtsc64() - timeStart);
+
 #if CONFIG_SLEEP
 #if defined(__i386__) || defined(__x86_64__)
     typedef void (*ResetProc)(void);
 
        reserved->dp.memory = 0;
        UNLOCK;
     }
-
-    if ((kIOMemoryTypePhysical != type) && (kIOMemoryTypePhysical64 != type))
+    if ((kIOMemoryTypePhysical == type) || (kIOMemoryTypePhysical64 == type))
+    {
+       ioGMDData * dataP;
+       if (_memoryEntries && (dataP = getDataP(_memoryEntries)) && dataP->fMappedBase)
+       {
+           dataP->fMapper->iovmFree(atop_64(dataP->fMappedBase), _pages);
+           dataP->fMappedBase = 0;
+       }
+    }
+    else
     {
-       while (_wireCount)
-           complete();
+       while (_wireCount) complete();
     }
-    if (_memoryEntries)
-        _memoryEntries->release();
+
+    if (_memoryEntries) _memoryEntries->release();
 
     if (_ranges.v && !(kIOMemoryAsReference & _flags))
     {
 
 
         IOService::updateConsoleUsers(NULL, kIOMessageSystemWillSleep);
 
-        // Notify platform that sleep has begun
-        getPlatform()->callPlatformFunction(
-                        sleepMessagePEFunction, false,
-                        (void *)(uintptr_t) kIOMessageSystemWillSleep,
-                        NULL, NULL, NULL);
-
         // Two change downs are sent by IOServicePM. Ignore the 2nd.
         // But tellClientsWithResponse() must be called for both.
         ignoreTellChangeDown = true;
             DLOG("sysPowerDownHandler timeout %d s\n", (int) (params->maxWaitForReply / 1000 / 1000));
 #endif
 
+            // Notify platform that sleep has begun, after the early
+            // sleep policy evaluation.
+            getPlatform()->callPlatformFunction(
+                            sleepMessagePEFunction, false,
+                            (void *)(uintptr_t) kIOMessageSystemWillSleep,
+                            NULL, NULL, NULL);
+
             if ( !OSCompareAndSwap( 0, 1, &gSleepOrShutdownPending ) )
             {
                 // Purposely delay the ack and hope that shutdown occurs quickly.
     IOPMSystemSleepPolicyEntry  entries[];
 } __attribute__((packed));
 
+enum { // attribute bits stored per sleep type in getSleepTypeAttributes()
+    kIOPMSleepAttributeHibernateSetup   = 0x00000001, // tested by callers when deciding whether hibernate setup applies to the sleep type
+    kIOPMSleepAttributeHibernateSleep   = 0x00000002  // tested by callers before clearing kIOHibernateModeSleep to force hibernation
+};
+
+static uint32_t
+getSleepTypeAttributes( uint32_t sleepType ) /*
+ * Return the kIOPMSleepAttribute* bit mask recorded for sleepType.
+ * Any sleepType at or beyond kIOPMSleepTypeLast yields 0 (no attributes).
+ */
+{
+    static const uint32_t sleepTypeAttributes[ kIOPMSleepTypeLast ] = // indexed by sleepType; the label comment on each row names the sleep type it is meant for
+    {
+    /* invalid   */ 0,
+    /* abort     */ 0,
+    /* normal    */ 0,
+    /* safesleep */ kIOPMSleepAttributeHibernateSetup,
+    /* hibernate */ kIOPMSleepAttributeHibernateSetup | kIOPMSleepAttributeHibernateSleep,
+    /* standby   */ kIOPMSleepAttributeHibernateSetup | kIOPMSleepAttributeHibernateSleep,
+    /* poweroff  */ kIOPMSleepAttributeHibernateSetup | kIOPMSleepAttributeHibernateSleep,
+    /* deepidle  */ 0
+    };
+
+    if (sleepType >= kIOPMSleepTypeLast) // bounds check: the table holds exactly kIOPMSleepTypeLast entries
+        return 0;
+
+    return sleepTypeAttributes[sleepType];
+}
+
 bool IOPMrootDomain::evaluateSystemSleepPolicy(
     IOPMSystemSleepParameters * params, int sleepPhase, uint32_t * hibMode )
 {
             goto done;
         }
 
-        if ((params->sleepType >= kIOPMSleepTypeSafeSleep) &&
+        if ((getSleepTypeAttributes(params->sleepType) &
+             kIOPMSleepAttributeHibernateSetup) &&
             ((*hibMode & kIOHibernateModeOn) == 0))
         {
             *hibMode |= (kIOHibernateModeOn | kIOHibernateModeSleep);
                                   &hibernateMode))
     {
         if (!hibernateNoDefeat &&
-            (gEarlySystemSleepParams.sleepType == kIOPMSleepTypeNormalSleep))
+            ((getSleepTypeAttributes(gEarlySystemSleepParams.sleepType) &
+              kIOPMSleepAttributeHibernateSetup) == 0))
         {
-            // Disable hibernate setup for normal sleep
+            // skip hibernate setup
             hibernateDisabled = true;
         }
     }
     if (evaluateSystemSleepPolicy(¶ms, kIOPMSleepPhase2, &hibernateMode))
     {
         if ((hibernateDisabled || hibernateAborted) &&
-            (params.sleepType != kIOPMSleepTypeNormalSleep))
+            (getSleepTypeAttributes(params.sleepType) &
+             kIOPMSleepAttributeHibernateSetup))
         {
             // Final evaluation picked a state requiring hibernation,
             // but hibernate setup was skipped. Retry using the early
             paramsData->release();
         }
 
-        if (params.sleepType >= kIOPMSleepTypeHibernate)
+        if (getSleepTypeAttributes(params.sleepType) &
+            kIOPMSleepAttributeHibernateSleep)
         {
-            // Disable safe sleep to force the hibernate path
+            // Disable sleep to force hibernation
             gIOHibernateMode &= ~kIOHibernateModeSleep;
         }
     }
     uint32_t    changeFlags = *inOutChangeFlags;
     uint32_t    currentPowerState = (uint32_t) getPowerState();
 
-    if ((currentPowerState == powerState) ||
-        (changeFlags & kIOPMParentInitiated))
+    if (changeFlags & kIOPMParentInitiated)
     {
         // FIXME: cancel any parent change (unexpected)
         // Root parent is permanently pegged at max power,
             // Revert device desire from SLEEP->ON.
             changePowerStateToPriv(ON_STATE);
         }
+        else
+        {
+            // Broadcast power down
+            *inOutChangeFlags |= kIOPMRootChangeDown;
+        }
+    }
+    else if (powerState > currentPowerState)
+    {
+        if ((_currentCapability & kIOPMSystemCapabilityCPU) == 0)
+        {
+            // Broadcast power up when waking from sleep, but not for the
+            // initial power change at boot by checking for cpu capability.
+            *inOutChangeFlags |= kIOPMRootChangeUp;
+        }
     }
 }
 
 
             if ( minutesToIdleSleep > minutesToDisplayDim )
                 minutesDelta = minutesToIdleSleep - minutesToDisplayDim;
-            else if( minutesToIdleSleep == minutesToDisplayDim )
+            else if( minutesToIdleSleep <= minutesToDisplayDim )
                 minutesDelta = 1;
 
             if ((sleepSlider == 0) && (minutesToIdleSleep != 0))
 
         if( matches) {
 
             lockForArbitration();
-            if( 0 == (__state[0] & kIOServiceFirstPublishState))
+            if( 0 == (__state[0] & kIOServiceFirstPublishState)) {
+               getMetaClass()->addInstance(this);
                 deliverNotification( gIOFirstPublishNotification,
                                      kIOServiceFirstPublishState, 0xffffffff );
+            }
            LOCKREADNOTIFY();
             __state[1] &= ~kIOServiceNeedConfigState;
             __state[1] |= kIOServiceConfigState;
             }
 
            UNLOCKNOTIFY();
-           if (didRegister) {
-               getMetaClass()->addInstance(this);
-           }
             unlockForArbitration();
 
             if (keepGuessing && matches->getCount() && (kIOReturnSuccess == getResources()))
 
 // Globals
 //******************************************************************************
 
-static bool                  gIOPMInitialized   = false;
-static uint32_t              gIOPMBusyCount     = 0;
-static uint32_t              gIOPMWorkCount     = 0;
-static IOWorkLoop *          gIOPMWorkLoop      = 0;
-static IOPMRequestQueue *    gIOPMRequestQueue  = 0;
-static IOPMRequestQueue *    gIOPMReplyQueue    = 0;
-static IOPMWorkQueue *       gIOPMWorkQueue     = 0;
-static IOPMCompletionQueue * gIOPMFreeQueue     = 0;
-static IOPMRequest *         gIOPMRequest       = 0;
-static IOService *           gIOPMRootNode      = 0;
-static IOPlatformExpert *    gPlatform          = 0;
+static bool                  gIOPMInitialized       = false;
+static uint32_t              gIOPMBusyCount         = 0;
+static uint32_t              gIOPMWorkCount         = 0;
+static uint32_t              gIOPMTickleGeneration  = 0;
+static IOWorkLoop *          gIOPMWorkLoop          = 0;
+static IOPMRequestQueue *    gIOPMRequestQueue      = 0;
+static IOPMRequestQueue *    gIOPMReplyQueue        = 0;
+static IOPMWorkQueue *       gIOPMWorkQueue         = 0;
+static IOPMCompletionQueue * gIOPMFreeQueue         = 0;
+static IOPMRequest *         gIOPMRequest           = 0;
+static IOService *           gIOPMRootNode          = 0;
+static IOPlatformExpert *    gPlatform              = 0;
 
 static const OSSymbol *      gIOPMPowerClientDevice     = 0;
 static const OSSymbol *      gIOPMPowerClientDriver     = 0;
             gIOPMRootNode = this;
             fParentsKnowState = true;
         }
+        else if (getProperty(kIOPMResetPowerStateOnWakeKey) == kOSBooleanTrue)
+        {
+            fResetPowerStateOnWake = true;
+        }
 
         fAckTimer = thread_call_allocate(
                        &IOService::ack_timer_expired, (thread_call_param_t)this);
         PM_UNLOCK();
     }
 
-    // Tell idleTimerExpired() to ignore idle timer.
+    // Clear idle period to prevent idleTimerExpired() from servicing
+    // idle timer expirations.
+
     fIdleTimerPeriod = 0;
     if (fIdleTimer && thread_call_cancel(fIdleTimer))
         release();
 void IOService::adjustPowerState ( uint32_t clamp )
 {
        PM_ASSERT_IN_GATE();
-       computeDesiredState(clamp);
+       computeDesiredState(clamp, false);
        if (fControllingDriver && fParentsKnowState && inPlane(gIOPowerPlane))
        {
         IOPMPowerChangeFlags changeFlags = kIOPMSelfInitiated;
 
-        // Indicate that children desires were ignored, and do not ask
+        // Indicate that children desires must be ignored, and do not ask
         // apps for permission to drop power. This is used by root domain
         // for demand sleep.
 
     OSIterator *                iter;
     OSObject *                  next;
     IOPowerConnection *         connection;
-    IOPMPowerStateIndex  newPowerState;
+    IOPMPowerStateIndex  maxPowerState;
     IOPMPowerFlags              combinedPowerFlags;
        bool                             savedParentsKnowState;
        IOReturn                         result = IOPMAckImplied;
 
     if ( fControllingDriver && !fInitialPowerChange )
     {
-               newPowerState = fControllingDriver->maxCapabilityForDomainState(
+               maxPowerState = fControllingDriver->maxCapabilityForDomainState(
                                                        combinedPowerFlags);
 
-        // Absorb parent's kIOPMSynchronize flag.
+        // Use kIOPMSynchronize below instead of kIOPMRootBroadcastFlags
+        // to avoid propagating the root change flags if any service must
+        // change power state due to root's will-change notification.
+        // Root does not change power state for kIOPMSynchronize.
+        
         myChangeFlags = kIOPMParentInitiated | kIOPMDomainWillChange |
                         (parentChangeFlags & kIOPMSynchronize);
 
                result = startPowerChange(
                  /* flags        */    myChangeFlags,
-                 /* power state  */    newPowerState,
+                 /* power state  */    maxPowerState,
                                 /* domain flags */     combinedPowerFlags,
                                 /* connection   */     whichParent,
                                 /* parent flags */     parentPowerFlags);
        IOPowerConnection *      whichParent = (IOPowerConnection *) request->fArg1;
     IOPMPowerChangeFlags parentChangeFlags = (IOPMPowerChangeFlags)(uintptr_t) request->fArg2;
     IOPMPowerChangeFlags myChangeFlags;
-    IOPMPowerStateIndex  newPowerState;
-    IOPMPowerStateIndex  initialDesire;
+    IOPMPowerStateIndex  maxPowerState;
+    IOPMPowerStateIndex  initialDesire = 0;
+    bool                 computeDesire = false;
+    bool                 desireChanged = false;
        bool                             savedParentsKnowState;
        IOReturn                         result = IOPMAckImplied;
 
 
     if ( fControllingDriver )
        {
-               newPowerState = fControllingDriver->maxCapabilityForDomainState(
+               maxPowerState = fControllingDriver->maxCapabilityForDomainState(
                                                        fParentsCurrentPowerFlags);
 
         if (fInitialPowerChange)
         {
+            computeDesire = true;
             initialDesire = fControllingDriver->initialPowerStateForDomainState(
-                            fParentsCurrentPowerFlags);
-            computeDesiredState(initialDesire);
+                                fParentsCurrentPowerFlags);
         }
-        else if (fAdvisoryTickleUsed && (newPowerState > 0) &&
-                 ((parentChangeFlags & kIOPMSynchronize) == 0))
+        else if (parentChangeFlags & kIOPMRootChangeUp)
         {
-            // re-compute desired state in case advisory tickle was enabled
-            computeDesiredState();
+            if (fAdvisoryTickleUsed)
+            {
+                // On system wake, re-compute the desired power state since
+                // gIOPMAdvisoryTickleEnabled will change for a full wake,
+                // which is an input to computeDesiredState(). This is not
+                // necessary for a dark wake because powerChangeDone() will
+                // handle the dark to full wake case, but it does no harm.
+
+                desireChanged = true;
+            }
+
+            if (fResetPowerStateOnWake)
+            {
+                // Query the driver for the desired power state on system wake.
+                // Default implementation returns the lowest power state.
+
+                IOPMPowerStateIndex wakePowerState =
+                    fControllingDriver->initialPowerStateForDomainState(
+                        kIOPMRootDomainState | kIOPMPowerOn );
+
+                // fDesiredPowerState was adjusted before going to sleep
+                // with fDeviceDesire at min.
+
+                if (wakePowerState > fDesiredPowerState)
+                {
+                    // Must schedule a power adjustment if we changed the
+                    // device desire. That will update the desired domain
+                    // power on the parent power connection and ping the
+                    // power parent if necessary.
+
+                    updatePowerClient(gIOPMPowerClientDevice, wakePowerState);
+                    desireChanged = true;
+                }
+            }
         }
 
-        // Absorb parent's kIOPMSynchronize flag.
+        if (computeDesire || desireChanged)
+            computeDesiredState(initialDesire, false);
+
+        // Absorb and propagate parent's broadcast flags
         myChangeFlags = kIOPMParentInitiated | kIOPMDomainDidChange |
-                        (parentChangeFlags & kIOPMSynchronize);
+                        (parentChangeFlags & kIOPMRootBroadcastFlags);
 
                result = startPowerChange(
                                 /* flags        */     myChangeFlags,
-                 /* power state  */    newPowerState,
+                 /* power state  */    maxPowerState,
                                 /* domain flags */     fParentsCurrentPowerFlags,
                                 /* connection   */     whichParent,
                                 /* parent flags */     0);
        }
 
        // If the parent registers its power driver late, then this is the
-       // first opportunity to tell our parent about our desire. 
+       // first opportunity to tell our parent about our desire. Or if the
+    // child's desire changed during a parent change notify.
 
-       if (!savedParentsKnowState && fParentsKnowState)
+       if ((!savedParentsKnowState && fParentsKnowState) || desireChanged)
        {
-               PM_LOG1("%s::powerDomainDidChangeTo parentsKnowState = true\n",
-                       getName());
+               PM_LOG1("%s::powerDomainDidChangeTo parentsKnowState %d\n",
+                       getName(), fParentsKnowState);
                requestDomainPower( fDesiredPowerState );
        }
 
         {
             IOPMRequest *   cancelRequest;
 
-            cancelRequest = acquirePMRequest( this, kIOPMRequestTypeIdleCancel );
+            cancelRequest = acquirePMRequest( getPMRootDomain(), kIOPMRequestTypeIdleCancel );
             if (cancelRequest)
             {
-                getPMRootDomain()->submitPMRequest( cancelRequest );
+                submitPMRequest( cancelRequest );
             }
         }
 #endif
 // [private] computeDesiredState
 //*********************************************************************************
 
-void IOService::computeDesiredState ( unsigned long localClamp )
+void IOService::computeDesiredState( unsigned long localClamp, bool computeOnly )
 {
     OSIterator *               iter;
     OSObject *                 next;
             if (hasChildren && (client == gIOPMPowerClientChildProxy))
                 continue;
 
+            // Advisory tickles are irrelevant unless system is in full wake
             if (client == gIOPMPowerClientAdvisoryTickle &&
                 !gIOPMAdvisoryTickleEnabled)
                 continue;
         (uint32_t) localClamp, (uint32_t) fTempClampPowerState,
                (uint32_t) fCurrentPowerState, newPowerState);
 
-    // Restart idle timer if stopped and device desire has increased.
-    // Or if advisory desire exists.
-    
-    if (fIdleTimerStopped)
+    if (!computeOnly)
     {
-        if (fDeviceDesire > 0)
-        {
-            fIdleTimerStopped = false;
-            fActivityTickleCount = 0;
-            clock_get_uptime(&fIdleTimerStartTime);
-            start_PM_idle_timer();
-        }
-        else if (fHasAdvisoryDesire)
+        // Restart idle timer if possible when device desire has increased.
+        // Or if an advisory desire exists.
+
+        if (fIdleTimerPeriod && fIdleTimerStopped)
         {
-            fIdleTimerStopped = false;
-            start_PM_idle_timer();
+            restartIdleTimer();
         }
-    }
 
-    // Invalidate cached tickle power state when desires change, and not
-    // due to a tickle request.  This invalidation must occur before the
-    // power state change to minimize races.  We want to err on the side
-    // of servicing more activity tickles rather than dropping one when
-    // the device is in a low power state.
+        // Invalidate cached tickle power state when desires change, and not
+        // due to a tickle request. Otherwise, if the driver has requested a
+        // lower power state while the tickle cache still holds a higher one,
+        // future tickles would be dropped until the cached value is lowered
+        // or invalidated. The invalidation must occur before the power
+        // transition to avoid dropping a necessary tickle.
 
-    if ((getPMRequestType() != kIOPMRequestTypeActivityTickle) &&
-        (fActivityTicklePowerState != kInvalidTicklePowerState))
-    {
-        IOLockLock(fActivityLock);
-        fActivityTicklePowerState = kInvalidTicklePowerState;
-        IOLockUnlock(fActivityLock);
+        if ((getPMRequestType() != kIOPMRequestTypeActivityTickle) &&
+            (fActivityTicklePowerState != kInvalidTicklePowerState))
+        {
+            IOLockLock(fActivityLock);
+            fActivityTicklePowerState = kInvalidTicklePowerState;
+            IOLockUnlock(fActivityLock);
+        }
     }
 }
 
 {
        IOPMRequest *   request;
        bool                    noPowerChange = true;
+    uint32_t        tickleFlags;
 
     if (!initialized)
         return true;    // no power change
                        fActivityTicklePowerState = stateNumber;
                        noPowerChange = false;
 
+            tickleFlags = kTickleTypeActivity | kTickleTypePowerRise;
                        request = acquirePMRequest( this, kIOPMRequestTypeActivityTickle );
                        if (request)
                        {
-                               request->fArg0 = (void *) stateNumber;  // power state
-                               request->fArg1 = (void *) true;         // power rise
-                request->fArg2 = (void *) false;        // regular tickle
+                               request->fArg0 = (void *) stateNumber;
+                               request->fArg1 = (void *) tickleFlags;
+                request->fArg2 = (void *) gIOPMTickleGeneration;
                                submitPMRequest(request);
                        }
                }
                        fAdvisoryTicklePowerState = stateNumber;
                        noPowerChange = false;
 
+            tickleFlags = kTickleTypeAdvisory | kTickleTypePowerRise;
                        request = acquirePMRequest( this, kIOPMRequestTypeActivityTickle );
                        if (request)
                        {
-                               request->fArg0 = (void *) stateNumber;  // power state
-                               request->fArg1 = (void *) true;         // power rise
-                request->fArg2 = (void *) true;         // advisory tickle
+                               request->fArg0 = (void *) stateNumber;
+                               request->fArg1 = (void *) tickleFlags;
+                request->fArg2 = (void *) gIOPMTickleGeneration;
                                submitPMRequest(request);
                        }
                }
 void IOService::handleActivityTickle ( IOPMRequest * request )
 {
        uint32_t ticklePowerState   = (uint32_t)(uintptr_t) request->fArg0;
-    bool     deviceWasActive    = (request->fArg1 == (void *) true);
-    bool     isRegularTickle    = (request->fArg2 == (void *) false);
+    uint32_t tickleFlags        = (uint32_t)(uintptr_t) request->fArg1;
+    uint32_t tickleGeneration   = (uint32_t)(uintptr_t) request->fArg2;
     bool     adjustPower        = false;
     
        PM_ASSERT_IN_GATE();
-    if (isRegularTickle)
+    if (fResetPowerStateOnWake && (tickleGeneration != gIOPMTickleGeneration))
+    {
+        // Drivers that don't want power restored on wake will drop any
+        // tickles that pre-date the current system wake. The model is
+        // that each wake is a fresh start, with power state depressed
+        // until a new tickle or an explicit power up request from the
+        // driver. It is possible for the PM work loop to enter the
+        // system sleep path with tickle requests queued.
+
+        return;
+    }
+
+    if (tickleFlags & kTickleTypeActivity)
     {
-        if (deviceWasActive)
+        if (tickleFlags & kTickleTypePowerRise)
         {
             if ((ticklePowerState > fDeviceDesire) &&
                 (ticklePowerState < fNumberOfPowerStates))
     }
     else    // advisory tickle
     {
-        if (deviceWasActive)
+        if (tickleFlags & kTickleTypePowerRise)
         {
             if ((ticklePowerState == fDeviceUsablePowerState) &&
                 (ticklePowerState < fNumberOfPowerStates))
     if (pending) release();
 }
 
+//*********************************************************************************
+// [private] restartIdleTimer
+//
+// Re-arm or park the idle timer based on the current desires:
+//   - device desire non-zero: clear the stopped flag, reset the activity
+//     tickle count and the timer start time, then start the timer.
+//   - advisory desire only: clear the stopped flag and start the timer
+//     without touching the tickle count or start time.
+//   - neither: mark the timer as stopped so a later rise in desire can
+//     restart it.
+// This function does not check fIdleTimerPeriod itself; the callers in this
+// file test it before calling.
+//*********************************************************************************
+
+void IOService::restartIdleTimer( void )
+{
+    if (fDeviceDesire != 0)
+    {
+        // Device desire can still be lowered by an idle timeout.
+        fIdleTimerStopped = false;
+        fActivityTickleCount = 0;
+        clock_get_uptime(&fIdleTimerStartTime);
+        start_PM_idle_timer();
+    }
+    else if (fHasAdvisoryDesire)
+    {
+        // Only an advisory desire remains; keep the timer running for it.
+        fIdleTimerStopped = false;
+        start_PM_idle_timer();
+    }
+    else
+    {
+        // Nothing left to time out; remember that the timer is not running.
+        fIdleTimerStopped = true;
+    }
+}
+
 //*********************************************************************************
 // idle_timer_expired
 //*********************************************************************************
 {
        IOPMRequest *   request;
        bool                    restartTimer = true;
+    uint32_t        tickleFlags;
 
-    if ( !initialized || !fIdleTimerPeriod || fLockedFlags.PMStop )
+    if ( !initialized || !fIdleTimerPeriod || fIdleTimerStopped ||
+         fLockedFlags.PMStop )
         return;
 
        IOLockLock(fActivityLock);
                if (fActivityTicklePowerState > 0)
                        fActivityTicklePowerState--;
 
+        tickleFlags = kTickleTypeActivity | kTickleTypePowerDrop;
                request = acquirePMRequest( this, kIOPMRequestTypeActivityTickle );
                if (request)
                {
-                       request->fArg0 = (void *) 0;            // power state (irrelevant)
-                       request->fArg1 = (void *) false;        // timer expiration (not tickle)
-            request->fArg2 = (void *) false;    // regular tickle
+                       request->fArg0 = (void *) 0;    // irrelevant
+                       request->fArg1 = (void *) tickleFlags;
+            request->fArg2 = (void *) gIOPMTickleGeneration;
                        submitPMRequest( request );
 
                        // Do not restart timer until after the tickle request has been
         // Want new tickles to turn into pm request after we drop the lock
         fAdvisoryTicklePowerState = kInvalidTicklePowerState;
 
+        tickleFlags = kTickleTypeAdvisory | kTickleTypePowerDrop;
                request = acquirePMRequest( this, kIOPMRequestTypeActivityTickle );
                if (request)
                {
-                       request->fArg0 = (void *) 0;            // power state (irrelevant)
-                       request->fArg1 = (void *) false;        // timer expiration (not tickle)
-            request->fArg2 = (void *) true;     // advisory tickle
+                       request->fArg0 = (void *) 0;    // irrelevant
+                       request->fArg1 = (void *) tickleFlags;
+            request->fArg2 = (void *) gIOPMTickleGeneration;
                        submitPMRequest( request );
 
                        // Do not restart timer until after the tickle request has been
         }
         else if (fAdvisoryTickleUsed)
         {
-            // Not root domain and advisory tickle target
+            // Not root domain and advisory tickle target.
             // Re-adjust power after power tree sync at the 'did' pass
+            // to recompute desire and adjust power state between dark
+            // and full wake transitions. Root domain is responsible
+            // for calling setAdvisoryTickleEnable() before starting
+            // the kIOPMSynchronize power change.
+
             if (!fAdjustPowerScheduled &&
                 (fHeadNoteChangeFlags & kIOPMDomainDidChange))
             {
             if (fCurrentCapabilityFlags & kIOPMStaticPowerValid)
                 fCurrentPowerConsumption = powerStatePtr->staticPower;
 
+            if (fHeadNoteChangeFlags & kIOPMRootChangeDown)
+            {
+                // Bump tickle generation count once the entire tree is down
+                gIOPMTickleGeneration++;
+            }
+
             // inform subclass policy-maker
             if (fPCDFunctionOverride && fParentsKnowState &&
                 assertPMDriverCall(&callEntry, kIOPMADC_NoInactiveCheck))
     // parent's power change
     if ( fHeadNoteChangeFlags & kIOPMParentInitiated)
     {
+        if (fHeadNoteChangeFlags & kIOPMRootChangeDown)
+            ParentChangeRootChangeDown();
+    
         if (((fHeadNoteChangeFlags & kIOPMDomainWillChange) &&
              (fCurrentPowerState >= fHeadNotePowerState))   ||
                          ((fHeadNoteChangeFlags & kIOPMDomainDidChange)  &&
     }
 }
 
+//*********************************************************************************
+// [private] requestDomainPowerApplier
+//
+// Call requestPowerDomainState() on all power parents.
 //*********************************************************************************
 
 struct IOPMRequestDomainPowerContext {
 
 //*********************************************************************************
 // [private] requestDomainPower
+//
+// Called by a power child to broadcast its desired power state to all parents.
+// If the child self-initiates a power change, it must call this function to
+// allow its parents to adjust power state.
 //*********************************************************************************
 
 IOReturn IOService::requestDomainPower(
     if (IS_PM_ROOT)
         return kIOReturnSuccess;
 
-    // Fetch the input power flags for the requested power state.
+    // Fetch our input power flags for the requested power state.
     // Parent request is stated in terms of required power flags.
 
        requestPowerFlags = fPowerStates[ourPowerState].inputPowerFlags;
     }
     fPreviousRequestPowerFlags = requestPowerFlags;
 
+    // The results will be collected by fHeadNoteDomainTargetFlags
     context.child              = this;
     context.requestPowerFlags  = requestPowerFlags;
     fHeadNoteDomainTargetFlags = 0;
         maxPowerState = fControllingDriver->maxCapabilityForDomainState(
                             fHeadNoteDomainTargetFlags );
 
-        if (maxPowerState < fHeadNotePowerState)
+        if (maxPowerState < ourPowerState)
         {
             PM_LOG1("%s: power desired %u:0x%x got %u:0x%x\n",
                 getName(),
        PM_ASSERT_IN_GATE();
     OUR_PMLog( kPMLogStartParentChange, fHeadNotePowerState, fCurrentPowerState );
 
-    // Power domain is lowering power
-    if ( fHeadNotePowerState < fCurrentPowerState )
+    // Root power domain has transitioned to its max power state
+    if ((fHeadNoteChangeFlags & (kIOPMDomainDidChange | kIOPMRootChangeUp)) ==
+                                (kIOPMDomainDidChange | kIOPMRootChangeUp))
     {
-        // Piggy-back idle timer cancellation on a parent down
-        if (0 == fHeadNotePowerState)
-            ParentChangeCancelIdleTimer(fHeadNotePowerState);
-    
-               // TODO: redundant? See handlePowerDomainWillChangeTo()
-               setParentInfo( fHeadNoteParentFlags, fHeadNoteParentConnection, true );
+        // Restart the idle timer stopped by ParentChangeRootChangeDown()
+        if (fIdleTimerPeriod && fIdleTimerStopped)
+        {
+            restartIdleTimer();
+        }
+    }
 
+    // Power domain is forcing us to lower power
+    if ( fHeadNotePowerState < fCurrentPowerState )
+    {
         PM_ACTION_2(actionPowerChangeStart, fHeadNotePowerState, &fHeadNoteChangeFlags);
 
        // Tell apps and kernel clients
             ParentChangeTellCapabilityWillChange();
             return IOPMWillAckLater;
         }
-        else if (fHeadNoteChangeFlags & kIOPMSynchronize)
+        else if (fHeadNoteChangeFlags & kIOPMRootBroadcastFlags)
         {
-            // We do not need to change power state, but notify
-            // children to propagate tree synchronization.
+            // No need to change power state, but broadcast change
+            // to our children.
             fMachineState     = kIOPM_SyncNotifyDidChange;
             fDriverCallReason = kDriverCallInformPreChange;
             notifyChildren();
     return IOPMAckImplied;
 }
 
+//******************************************************************************
+// [private] ParentChangeRootChangeDown
+//
+// Root domain has finished the transition to the system sleep state, and all
+// drivers in the power plane should have powered down. Cancel the idle timer,
+// and also reset the device desire for those drivers that don't want power
+// automatically restored on wake.
+//
+// Invoked for a parent-initiated change that carries kIOPMRootChangeDown.
+// Two policies are applied after the idle timer is stopped:
+//   - fResetPowerStateOnWake: unconditionally remove the device desire and
+//     invalidate the tickle cache.
+//   - otherwise, if fAdvisoryTickleUsed: drop the device desire only when
+//     the device has not been tickled recently.
+//******************************************************************************
+
+void IOService::ParentChangeRootChangeDown( void )
+{
+    // Always stop the idle timer before root power down
+    if (fIdleTimerPeriod && !fIdleTimerStopped)
+    {
+        // Mark stopped first; the root change up path restarts the timer
+        // only when this flag is set.
+        fIdleTimerStopped = true;
+        // NOTE(review): release() presumably balances a reference held for
+        // the pending timer call -- confirm against start_PM_idle_timer().
+        if (fIdleTimer && thread_call_cancel(fIdleTimer))
+            release();
+    }
+
+    if (fResetPowerStateOnWake)
+    {
+        // Reset device desire down to the lowest power state.
+        // Advisory tickle desire is intentionally untouched since
+        // it has no effect until system is promoted to full wake.
+
+        if (fDeviceDesire != 0)
+        {
+            updatePowerClient(gIOPMPowerClientDevice, 0);
+            // computeOnly == true: recompute the desired state without
+            // restarting the idle timer or invalidating the tickle cache
+            // inside computeDesiredState().
+            computeDesiredState(0, true);
+            PM_LOG1("%s: tickle desire removed\n", fName);
+        }
+
+        // Invalidate tickle cache so the next tickle will issue a request
+        IOLockLock(fActivityLock);
+        fDeviceWasActive = false;
+        fActivityTicklePowerState = kInvalidTicklePowerState;
+        IOLockUnlock(fActivityLock);
+
+        fIdleTimerMinPowerState = 0;
+    }
+    else if (fAdvisoryTickleUsed)
+    {
+        // Less aggressive mechanism to accelerate idle timer expiration
+        // before system sleep. May not always allow the driver to wake
+        // up from system sleep in the min power state.
+
+        AbsoluteTime    now;
+        uint64_t        nsec;
+        bool            dropTickleDesire = false;
+
+        // Only consider dropping the desire when an idle timer is in use,
+        // not ignored, has no minimum power state, and a device desire
+        // actually exists.
+        if (fIdleTimerPeriod && !fIdleTimerIgnored &&
+            (fIdleTimerMinPowerState == 0) &&
+            (fDeviceDesire != 0))
+        {
+            // Activity state is examined and updated under fActivityLock.
+            IOLockLock(fActivityLock);
+
+            if (!fDeviceWasActive)
+            {
+                // No tickles since the last idle timer expiration.
+                // Safe to drop the device desire to zero.
+                dropTickleDesire = true;
+            }
+            else
+            {
+                // Was tickled since the last idle timer expiration,
+                // but not within the kNoTickleCancelWindow interval.
+                // Elapsed time since the last activity, in nanoseconds:
+                clock_get_uptime(&now);
+                SUB_ABSOLUTETIME(&now, &fDeviceActiveTimestamp);
+                absolutetime_to_nanoseconds(now, &nsec);
+                if (nsec >= kNoTickleCancelWindow)
+                {
+                    dropTickleDesire = true;
+                }
+            }
+
+            if (dropTickleDesire)
+            {
+                // Force the next tickle to raise power state
+                fDeviceWasActive = false;
+                fActivityTicklePowerState = kInvalidTicklePowerState;
+            }
+
+            IOLockUnlock(fActivityLock);
+        }
+
+        // The desire itself is updated after the activity lock is released.
+        if (dropTickleDesire)
+        {
+            // Advisory tickle desire is intentionally untouched since
+            // it has no effect until system is promoted to full wake.
+
+            updatePowerClient(gIOPMPowerClientDevice, 0);
+            // computeOnly == true: no idle timer restart, no tickle cache
+            // invalidation inside computeDesiredState().
+            computeDesiredState(0, true);
+            PM_LOG1("%s: tickle desire dropped\n", fName);
+        }
+    }
+}
+
 //*********************************************************************************
 // [private] ParentChangeTellPriorityClientsPowerDown
 //
     nub->release();
 }
 
-void IOService::ParentChangeCancelIdleTimer( IOPMPowerStateIndex newPowerState )
-{
-    AbsoluteTime    now;
-    uint64_t        nsec;
-    bool            cancel = false;
-
-    // No ready or idle timer not in use
-    if (!initialized || !fIdleTimerPeriod || fLockedFlags.PMStop ||
-        !fAdvisoryTickleUsed)
-        return;
-
-    // Not allowed to induce artifical idle timeout
-    if (fIdleTimerIgnored || fIdleTimerMinPowerState)
-        goto done;
-
-    // Idle timer already has no influence
-    if (!fDesiredPowerState || fIdleTimerStopped)
-        goto done;
-
-       IOLockLock(fActivityLock);
-
-    if (!fDeviceWasActive)
-    {
-        // No tickles since the last idle timer expiration.
-        // Safe to drop the device desire to zero.
-        cancel = true;
-    }
-    else
-    {
-        // Was tickled since the last idle timer expiration,
-        // but not in the last minute.
-        clock_get_uptime(&now);
-        SUB_ABSOLUTETIME(&now, &fDeviceActiveTimestamp);
-        absolutetime_to_nanoseconds(now, &nsec);
-        if (nsec >= kNoTickleCancelWindow)
-        {
-            cancel = true;
-        }
-    }
-
-    if (cancel)
-    {
-        // Force the next tickle to raise power state
-               fActivityTicklePowerState = kInvalidTicklePowerState;
-        fDeviceWasActive = false;
-    }
-
-       IOLockUnlock(fActivityLock);
-
-    if (cancel)
-    {
-        // cancel idle timer
-        if (fIdleTimer && thread_call_cancel(fIdleTimer))
-            release();
-
-        updatePowerClient(gIOPMPowerClientDevice, 0);
-        computeDesiredState();
-
-        fIdleTimerStopped = true;
-    }
-
-done:
-    OUR_PMLog( kPMLogStartParentChange, fHeadNotePowerState, fCurrentPowerState );
-    PM_LOG("%s::%s cancel=%d\n", fName, __FUNCTION__, cancel);
-}
-
 // MARK: -
 // MARK: Ack and Settle timers
 
 
 void IOService::startSettleTimer( void )
 {
+#if NOT_USEFUL
+    // This function is broken and serves no useful purpose since it never
+    // updates fSettleTimeUS to a non-zero value to stall the state machine,
+    // yet it starts a delay timer. It appears no driver relies on a delay
+    // from settleUpTime and settleDownTime in the power state table.
+
     AbsoluteTime        deadline;
     IOPMPowerStateIndex i;
     uint32_t            settleTime = 0;
         pending = thread_call_enter_delayed(fSettleTimer, deadline);
         if (pending) release();
     }
+#endif
 }
 
 //*********************************************************************************
 {
     int i;
 
+    if (fResetPowerStateOnWake && (domainState & kIOPMRootDomainState))
+    {
+        // Return lowest power state for any root power domain changes
+        return 0;
+    }
+
     if (fNumberOfPowerStates == 0 )
     {
         return 0;
        // Catch requests created by idleTimerExpired().
 
        if ((request->getType() == kIOPMRequestTypeActivityTickle) &&
-           (request->fArg1     == (void *) false))
+           (((uintptr_t) request->fArg1) & kTickleTypePowerDrop)  &&
+        fIdleTimerPeriod)
        {
-               // Idle timer expiration - power drop request completed.
-               // Restart the idle timer if deviceDesire can go lower, otherwise set
-               // a flag so we know to restart idle timer when fDeviceDesire > 0.
-
-               if (fDeviceDesire > 0)
-               {
-            fActivityTickleCount = 0;
-                       clock_get_uptime(&fIdleTimerStartTime);
-                       start_PM_idle_timer();
-               }
-        else if (fHasAdvisoryDesire)
-        {
-                       start_PM_idle_timer();
-        }
-               else
-        {
-                       fIdleTimerStopped = true;
-        }
+        restartIdleTimer();
     }
 
     // If the request is linked, then Work queue has already incremented its
                 fIsPreChange = false;
 
                 if (fHeadNoteChangeFlags & kIOPMParentInitiated)
+                {
                     fMachineState = kIOPM_SyncFinish;
+                }
                 else
+                {
+                    assert(IS_ROOT_DOMAIN);
                     fMachineState = kIOPM_SyncTellCapabilityDidChange;
+                }
 
                 fDriverCallReason = kDriverCallInformPostChange;
                 notifyChildren();
         case kIOPMRequestTypeSetIdleTimerPeriod:
             {
                 fIdleTimerPeriod = (uintptr_t) request->fArg0;
-
                 if ((false == fLockedFlags.PMStop) && (fIdleTimerPeriod > 0))
-                {
-                    fActivityTickleCount = 0;
-                    clock_get_uptime(&fIdleTimerStartTime);
-                    start_PM_idle_timer();
-                }
+                    restartIdleTimer();
             }
             break;
 
 
        fType = kIOPMRequestTypeInvalid;
 
+#if NOT_READY
        if (fCompletionAction)
        {
         fCompletionAction(fCompletionTarget, fCompletionParam, fCompletionStatus);
     }
+#endif
 
        if (fTarget)
        {
         fRequestNext = next;
         fRequestNext->fWorkWaitCount++;
 #if LOG_REQUEST_ATTACH
-        kprintf("Attached next: %p [0x%x] -> %p [0x%x, %u] %s\n",
+        PM_LOG("Attached next: %p [0x%x] -> %p [0x%x, %u] %s\n",
             this, (uint32_t) fType, fRequestNext,
             (uint32_t) fRequestNext->fType,
             (uint32_t) fRequestNext->fWorkWaitCount,
         if (fRequestNext->fWorkWaitCount)
             fRequestNext->fWorkWaitCount--;
 #if LOG_REQUEST_ATTACH
-        kprintf("Detached next: %p [0x%x] -> %p [0x%x, %u] %s\n",
+        PM_LOG("Detached next: %p [0x%x] -> %p [0x%x, %u] %s\n",
             this, (uint32_t) fType, fRequestNext,
             (uint32_t) fRequestNext->fType,
             (uint32_t) fRequestNext->fWorkWaitCount,
         fRequestRoot = root;
         fRequestRoot->fFreeWaitCount++;
 #if LOG_REQUEST_ATTACH
-        kprintf("Attached root: %p [0x%x] -> %p [0x%x, %u] %s\n",
+        PM_LOG("Attached root: %p [0x%x] -> %p [0x%x, %u] %s\n",
             this, (uint32_t) fType, fRequestRoot,
             (uint32_t) fRequestRoot->fType,
             (uint32_t) fRequestRoot->fFreeWaitCount,
         if (fRequestRoot->fFreeWaitCount)
             fRequestRoot->fFreeWaitCount--;
 #if LOG_REQUEST_ATTACH
-        kprintf("Detached root: %p [0x%x] -> %p [0x%x, %u] %s\n",
+        PM_LOG("Detached root: %p [0x%x] -> %p [0x%x, %u] %s\n",
             this, (uint32_t) fType, fRequestRoot,
             (uint32_t) fRequestRoot->fType,
             (uint32_t) fRequestRoot->fFreeWaitCount,
 
     // PM state lock.
     IOLock *                PMLock;
 
-    unsigned int            InitialPowerChange:1;
-    unsigned int            InitialSetPowerState:1;
-    unsigned int            DeviceOverrideEnabled:1;
-    unsigned int            DoNotPowerDown:1;
-    unsigned int            ParentsKnowState:1;
-    unsigned int            StrictTreeOrder:1;
-    unsigned int            IdleTimerStopped:1;
-    unsigned int            AdjustPowerScheduled:1;
-    unsigned int            IsPreChange:1;
-    unsigned int            DriverCallBusy:1;
-    unsigned int            PCDFunctionOverride:1;
-    unsigned int            IdleTimerIgnored:1;
-    unsigned int            HasAdvisoryDesire:1;
-    unsigned int            AdvisoryTickleUsed:1;
+    unsigned int            InitialPowerChange          :1;
+    unsigned int            InitialSetPowerState        :1;
+    unsigned int            DeviceOverrideEnabled       :1;
+    unsigned int            DoNotPowerDown              :1;
+    unsigned int            ParentsKnowState            :1;
+    unsigned int            StrictTreeOrder             :1;
+    unsigned int            IdleTimerStopped            :1;
+    unsigned int            AdjustPowerScheduled        :1;
+    
+    unsigned int            IsPreChange                 :1;
+    unsigned int            DriverCallBusy              :1;
+    unsigned int            PCDFunctionOverride         :1;
+    unsigned int            IdleTimerIgnored            :1;
+    unsigned int            HasAdvisoryDesire           :1;
+    unsigned int            AdvisoryTickleUsed          :1;
+    unsigned int            ResetPowerStateOnWake       :1;
 
     // Time of last device activity.
     AbsoluteTime            DeviceActiveTimestamp;
 #define fIdleTimerIgnored           pwrMgt->IdleTimerIgnored
 #define fHasAdvisoryDesire          pwrMgt->HasAdvisoryDesire
 #define fAdvisoryTickleUsed         pwrMgt->AdvisoryTickleUsed
+#define fResetPowerStateOnWake      pwrMgt->ResetPowerStateOnWake
 #define fDeviceActiveTimestamp      pwrMgt->DeviceActiveTimestamp
 #define fActivityLock               pwrMgt->ActivityLock
 #define fIdleTimerPeriod            pwrMgt->IdleTimerPeriod
 #define kIOPMSyncTellPowerDown      0x0400  // send the ask/will power off messages
 #define kIOPMSyncCancelPowerDown    0x0800  // sleep cancel for maintenance wake
 #define kIOPMInitialPowerChange     0x1000  // set for initial power change
+#define kIOPMRootChangeUp           0x2000  // Root power domain change up
+#define kIOPMRootChangeDown         0x4000  // Root power domain change down
+
+#define kIOPMRootBroadcastFlags     (kIOPMSynchronize  | \
+                                     kIOPMRootChangeUp | kIOPMRootChangeDown)
+
+// Activity tickle request flags
+#define kTickleTypePowerDrop        0x01
+#define kTickleTypePowerRise        0x02
+#define kTickleTypeActivity         0x04
+#define kTickleTypeAdvisory         0x08
 
 enum {
     kDriverCallInformPreChange,
 
                set $kgm_actint_framecount = 0
                while ($mysp != 0) && (($mysp & $stkmask) == 0) \
                      && ($mysp != $prevsp) \
-                     && ((((unsigned long) $mysp ^ (unsigned long) $prevsp) < 0x2000) \
+                     && ((((unsigned long) $mysp - (unsigned long) $prevsp) < 0x4000) \
                      || (((unsigned long)$mysp < ((unsigned long) ($kgm_thread->kernel_stack+kernel_stack_size))) \
                      && ((unsigned long)$mysp > (unsigned long) ($kgm_thread->kernel_stack)))) \
                      && ($kgm_actint_framecount < 128)
 
            if (superClassLink) {
                superClassLink->removeInstance(reserved->instances, true);
            }
+           IOLockLock(sAllClassesLock);
            reserved->instances->release();
            reserved->instances = 0;
+           IOLockUnlock(sAllClassesLock);
        }
     }
 
 
        .align 2, 0x90
        .globl __get_cpu_capabilities
 __get_cpu_capabilities:
-       movq    $(_COMM_PAGE_CPU_CAPABILITIES), %rax
-       movl    (%rax), %eax
+       movq    $(_COMM_PAGE_CPU_CAPABILITIES64), %rax  /* address of the capabilities word */
+       movq    (%rax), %rax                            /* load all 8 bytes into %rax */
        ret
 
 #elif defined(__i386__)
        .align 2, 0x90
        .globl __get_cpu_capabilities
 __get_cpu_capabilities:
-       movl    _COMM_PAGE_CPU_CAPABILITIES, %eax
+       movl    _COMM_PAGE_CPU_CAPABILITIES64, %eax     /* bytes 0-3 of the capabilities word */
+       movl    _COMM_PAGE_CPU_CAPABILITIES64+4, %edx   /* bytes 4-7; NOTE(review): presumably the high half of an %edx:%eax 64-bit return - confirm */
        ret
 
 #else
 
 
 osfmk/i386/startup64.c         standard
 osfmk/x86_64/idt64.s           standard
+
 
 
        machine_conf();
 
-#if NOTYET
-       ml_thrm_init();         /* Start thermal monitoring on this processor */
-#endif
-
        /*
         * Start the system.
         */
 
 #include <kern/kalloc.h>
 #include <sys/kdebug.h>
 
+#include <i386/machine_cpu.h>
+#include <i386/misc_protos.h>
+#include <i386/cpuid.h>
+
+#define PERMIT_PERMCHECK (0)
+
 diagWork        dgWork;
 uint64_t        lastRuptClear = 0ULL;
 
 typedef struct {
        uint64_t caperf;
        uint64_t cmperf;
-       uint64_t ccres[3];
-       uint64_t crtimes[4];
-       uint64_t citimes[4];
+       uint64_t ccres[6];      /* diagCall64 stores the CPU's cpu_c3res in slot 0 */
+       uint64_t crtimes[CPU_RTIME_BINS];       /* bcopy'd from the CPU's cpu_rtimes[], sized by this array */
+       uint64_t citimes[CPU_ITIME_BINS];       /* bcopy'd from the CPU's cpu_itimes[], sized by this array */
        uint64_t crtime_total;
        uint64_t citime_total;
+       uint64_t cpu_idle_exits;        /* copied from the CPU's cpu_idle_exits */
+       uint64_t cpu_insns;     /* copied from the CPU's cpu_cur_insns */
+       uint64_t cpu_ucc;       /* copied from the CPU's cpu_cur_ucc */
+       uint64_t cpu_urc;       /* copied from the CPU's cpu_cur_urc */
 } core_energy_stat_t;
 
 typedef struct {
-       uint64_t pkg_cres[2][4];
+       uint64_t pkg_cres[2][7];        /* diagCall64 fills row 0: [2]=C6, [3]=C7, [4]=C8, [5]=C9, [6]=C10 residency */
        uint64_t pkg_power_unit;
        uint64_t pkg_energy;
+       uint64_t pp0_energy;    /* read from MSR_IA32_PP0_ENERGY_STATUS */
+       uint64_t pp1_energy;    /* read from MSR_IA32_PP1_ENERGY_STATUS */
+       uint64_t ddr_energy;    /* preset to ~0ULL, then read from MSR_IA32_DDR_ENERGY_STATUS */
+       uint64_t llc_flushed_cycles;    /* preset to ~0ULL, then read from MSR_IA32_LLC_FLUSHED_RESIDENCY_TIMER */
+       uint64_t ring_ratio_instantaneous;      /* preset to ~0ULL, then read from MSR_IA32_RING_PERF_STATUS */
+       uint64_t IA_frequency_clipping_cause;   /* preset to ~0ULL, then read from MSR_IA32_IA_PERF_LIMIT_REASONS */
+       uint64_t GT_frequency_clipping_cause;   /* preset to ~0ULL, then read from MSR_IA32_GT_PERF_LIMIT_REASONS */
+       uint64_t pkg_idle_exits;        /* copied from the current CPU's package->package_idle_exits */
+       uint64_t pkg_rtimes[CPU_RTIME_BINS];
+       uint64_t pkg_itimes[CPU_ITIME_BINS];
+       uint64_t mbus_delay_time;
+       uint64_t mint_delay_time;
        uint32_t ncpus;
        core_energy_stat_t cest[];
 } pkg_energy_statistics_t;
 int 
 diagCall64(x86_saved_state_t * state)
 {
-       uint64_t        curpos, i, j;
-       uint64_t        selector, data;
-       uint64_t        currNap, durNap;
+       uint64_t        curpos, i, j;
+       uint64_t        selector, data;
+       uint64_t        currNap, durNap;
        x86_saved_state64_t     *regs;
        boolean_t       diagflag;
        uint32_t        rval = 0;
                pkes.pkg_cres[0][2] = ((uint64_t)c6h << 32) | c6l;
                pkes.pkg_cres[0][3] = ((uint64_t)c7h << 32) | c7l;
 
+               uint32_t cpumodel = cpuid_info()->cpuid_model;
+               boolean_t c8avail;
+               switch (cpumodel) {
+               case CPUID_MODEL_HASWELL_ULT:
+                       c8avail = TRUE;
+                       break;
+               default:
+                       c8avail = FALSE;
+                       break;
+               }
+               uint64_t c8r = ~0ULL, c9r = ~0ULL, c10r = ~0ULL;
+
+               if (c8avail) {
+                       rdmsr64_carefully(MSR_IA32_PKG_C8_RESIDENCY, &c8r);
+                       rdmsr64_carefully(MSR_IA32_PKG_C9_RESIDENCY, &c9r);
+                       rdmsr64_carefully(MSR_IA32_PKG_C10_RESIDENCY, &c10r);
+               }
+
+               pkes.pkg_cres[0][4] = c8r;
+               pkes.pkg_cres[0][5] = c9r;
+               pkes.pkg_cres[0][6] = c10r;
+
+               pkes.ddr_energy = ~0ULL;
+               rdmsr64_carefully(MSR_IA32_DDR_ENERGY_STATUS, &pkes.ddr_energy);
+               pkes.llc_flushed_cycles = ~0ULL;
+               rdmsr64_carefully(MSR_IA32_LLC_FLUSHED_RESIDENCY_TIMER, &pkes.llc_flushed_cycles);
+
+               pkes.ring_ratio_instantaneous = ~0ULL;
+               rdmsr64_carefully(MSR_IA32_RING_PERF_STATUS, &pkes.ring_ratio_instantaneous);
+
+               pkes.IA_frequency_clipping_cause = ~0ULL;
+               rdmsr64_carefully(MSR_IA32_IA_PERF_LIMIT_REASONS, &pkes.IA_frequency_clipping_cause);
+
+               pkes.GT_frequency_clipping_cause = ~0ULL;
+               rdmsr64_carefully(MSR_IA32_GT_PERF_LIMIT_REASONS, &pkes.GT_frequency_clipping_cause);
+
                rdmsr_carefully(MSR_IA32_PKG_POWER_SKU_UNIT, &pkg_unit_l, &pkg_unit_h);
                rdmsr_carefully(MSR_IA32_PKG_ENERGY_STATUS, &pkg_ecl, &pkg_ech);
-
                pkes.pkg_power_unit = ((uint64_t)pkg_unit_h << 32) | pkg_unit_l;
                pkes.pkg_energy = ((uint64_t)pkg_ech << 32) | pkg_ecl;
 
+               rdmsr_carefully(MSR_IA32_PP0_ENERGY_STATUS, &pkg_ecl, &pkg_ech);
+               pkes.pp0_energy = ((uint64_t)pkg_ech << 32) | pkg_ecl;
+
+               rdmsr_carefully(MSR_IA32_PP1_ENERGY_STATUS, &pkg_ecl, &pkg_ech);
+               pkes.pp1_energy = ((uint64_t)pkg_ech << 32) | pkg_ecl;
+
+               pkes.pkg_idle_exits = current_cpu_datap()->lcpu.package->package_idle_exits;
                pkes.ncpus = real_ncpus;
 
                (void) ml_set_interrupts_enabled(TRUE);
                mp_cpus_call(CPUMASK_ALL, ASYNC, cpu_powerstats, NULL);
                
                for (i = 0; i < real_ncpus; i++) {
+                       (void) ml_set_interrupts_enabled(FALSE);
+
                        cest.caperf = cpu_data_ptr[i]->cpu_aperf;
                        cest.cmperf = cpu_data_ptr[i]->cpu_mperf;
                        cest.ccres[0] = cpu_data_ptr[i]->cpu_c3res;
 
                        bcopy(&cpu_data_ptr[i]->cpu_rtimes[0], &cest.crtimes[0], sizeof(cest.crtimes));
                        bcopy(&cpu_data_ptr[i]->cpu_itimes[0], &cest.citimes[0], sizeof(cest.citimes));
+
                        cest.citime_total = cpu_data_ptr[i]->cpu_itime_total;
                        cest.crtime_total = cpu_data_ptr[i]->cpu_rtime_total;
+                       cest.cpu_idle_exits = cpu_data_ptr[i]->cpu_idle_exits;
+                       cest.cpu_insns = cpu_data_ptr[i]->cpu_cur_insns;
+                       cest.cpu_ucc = cpu_data_ptr[i]->cpu_cur_ucc;
+                       cest.cpu_urc = cpu_data_ptr[i]->cpu_cur_urc;
+                       (void) ml_set_interrupts_enabled(TRUE);
 
                        copyout(&cest, curpos, sizeof(cest));
                        curpos += sizeof(cest);
                rval = 1;
        }
                break;
+       case dgEnaPMC:
+       {
+               boolean_t enable = TRUE;
+               mp_cpus_call(CPUMASK_ALL, ASYNC, cpu_pmc_control, &enable);
+               rval = 1;
+       }
+       break;
 
 #if    DEBUG
        case dgGzallocTest:
                kfree(ptr, 1024);
                *ptr = 0x42;
        }
-               break;
+       break;
 #endif
 
-#if    defined(__x86_64__)             
+#if PERMIT_PERMCHECK   
        case    dgPermCheck:
        {
                (void) ml_set_interrupts_enabled(TRUE);
                rval = pmap_permissions_verify(kernel_pmap, kernel_map, 0, ~0ULL);
        }
                break;
-#endif /* __x86_64__*/
+#endif /* PERMIT_PERMCHECK */
 
        default:                /* Handle invalid ones */
                rval = 0;       /* Return an exception */
 
 void cpu_powerstats(__unused void *arg) {
        cpu_data_t *cdp = current_cpu_datap();
-       int cnum = cdp->cpu_number;
+       __unused int cnum = cdp->cpu_number;
        uint32_t cl = 0, ch = 0, mpl = 0, mph = 0, apl = 0, aph = 0;
 
        rdmsr_carefully(MSR_IA32_MPERF, &mpl, &mph);
        cdp->cpu_mperf = ((uint64_t)mph << 32) | mpl;
        cdp->cpu_aperf = ((uint64_t)aph << 32) | apl;
 
-       if (cnum & 1)
-               return;
+       uint64_t ctime = mach_absolute_time();
+       cdp->cpu_rtime_total += ctime - cdp->cpu_ixtime;
+       cdp->cpu_ixtime = ctime;
 
        rdmsr_carefully(MSR_IA32_CORE_C3_RESIDENCY, &cl, &ch);
        cdp->cpu_c3res = ((uint64_t)ch << 32) | cl;
 
        rdmsr_carefully(MSR_IA32_CORE_C7_RESIDENCY, &cl, &ch);
        cdp->cpu_c7res = ((uint64_t)ch << 32) | cl;
+       
+       uint64_t insns = read_pmc(FIXED_PMC0);
+       uint64_t ucc = read_pmc(FIXED_PMC1);
+       uint64_t urc = read_pmc(FIXED_PMC2);
+       cdp->cpu_cur_insns = insns;
+       cdp->cpu_cur_ucc = ucc;
+       cdp->cpu_cur_urc = urc;
+}
+
+/*
+ * Per-CPU callback that switches the performance counters on or off for
+ * the calling cpu and records the result in cpu_fixed_pmcs_enabled.
+ *
+ * enablep: points to a boolean_t; TRUE enables, FALSE disables.
+ *
+ * MSR 0x38F is IA32_PERF_GLOBAL_CTRL and MSR 0x38D is IA32_FIXED_CTR_CTRL
+ * in the Intel architectural PMU; CR4.PCE additionally permits rdpmc
+ * outside ring 0.
+ */
+void cpu_pmc_control(void *enablep) {
+       boolean_t       turn_on = *(boolean_t *)enablep;
+       cpu_data_t      *cdp = current_cpu_datap();
+
+       /* Global enable mask first, then the per-fixed-counter control. */
+       wrmsr64(0x38F, turn_on ? 0x70000000FULL : 0);
+       wrmsr64(0x38D, turn_on ? 0x333 : 0);
+
+       if (turn_on)
+               set_cr4(get_cr4() | CR4_PCE);
+       else
+               set_cr4((get_cr4() & ~CR4_PCE));
+
+       cdp->cpu_fixed_pmcs_enabled = turn_on;
 }
 
 #define dgBind 18
 #define dgAcntg 20
 #define dgKlra 21
-#define dgKfree 22
+#define dgEnaPMC 22
 #define        dgWar 23
 #define dgNapStat 24
 #define dgRuptStat 25
 
 extern diagWork dgWork;
 
-
+/*
+ * Counter selectors passed to read_pmc() (loaded into %ecx for rdpmc).
+ * Bit 30 of the rdpmc index selects the fixed-function counter bank;
+ * the low bits pick counter 0, 1 or 2 within it. cpu_powerstats() stores
+ * them as cpu_cur_insns, cpu_cur_ucc and cpu_cur_urc respectively.
+ */
+#define FIXED_PMC (1 << 30)
+#define FIXED_PMC0 (FIXED_PMC)
+#define FIXED_PMC1 (FIXED_PMC | 1)
+#define FIXED_PMC2 (FIXED_PMC | 2)
+ 
+/*
+ * Execute rdpmc for the given counter selector and return the 64-bit
+ * value assembled from %edx (high half) and %eax (low half).
+ */
+static inline uint64_t read_pmc(uint32_t counter)
+{
+       uint32_t low_word = 0;
+       uint32_t high_word = 0;
+       uint64_t value;
+
+       __asm__ volatile("rdpmc" : "=a" (low_word), "=d" (high_word) : "c" (counter));
+
+       value = (uint64_t)high_word;
+       value <<= 32;
+       value |= (uint64_t)low_word;
+       return value;
+}
 #endif /* _DIAGNOSTICS_H_ */
 
 #endif /* KERNEL_PRIVATE */
 
 #include <i386/tsc.h>
 
 #include <kern/cpu_data.h>
+#include <kern/etimer.h>
+#include <kern/machine.h>
+#include <kern/timer_queue.h>
 #include <console/serial_protos.h>
 #include <machine/pal_routines.h>
 #include <vm/vm_page.h>
 #include <IOKit/IOHibernatePrivate.h>
 #endif
 #include <IOKit/IOPlatformExpert.h>
-
 #include <sys/kdebug.h>
 
 #if CONFIG_SLEEP
 extern void    acpi_sleep_cpu(acpi_sleep_callback, void * refcon);
-extern void acpi_wake_prot(void);
+extern void    acpi_wake_prot(void);
 #endif
 extern kern_return_t IOCPURunPlatformQuiesceActions(void);
 extern kern_return_t IOCPURunPlatformActiveActions(void);
 
 unsigned int           save_kdebug_enable = 0;
 static uint64_t                acpi_sleep_abstime;
+static uint64_t                acpi_idle_abstime;
+static uint64_t                acpi_wake_abstime;
+boolean_t              deep_idle_rebase = TRUE;
 
 #if CONFIG_SLEEP
 static void
 
 extern void                    slave_pstart(void);
 
+extern unsigned int            wake_nkdbufs;
 
 void
 acpi_sleep_kernel(acpi_sleep_callback func, void *refcon)
        if (lapic_probe())
                lapic_configure();
 
+       acpi_wake_abstime = mach_absolute_time();
+
        /* let the realtime clock reset */
        rtc_sleep_wakeup(acpi_sleep_abstime);
 
        kdebug_enable = save_kdebug_enable;
 
+       if (kdebug_enable == 0) {
+               if (wake_nkdbufs)
+                       start_kern_tracing(wake_nkdbufs, TRUE);
+       }
+
+       /* Reconfigure FP/SIMD unit */
+       init_fpu();
+       clear_ts();
+
        IOCPURunPlatformActiveActions();
 
        if (did_hibernate) {
        /* Restart timer interrupts */
        rtc_timer_start();
 
-       /* Reconfigure FP/SIMD unit */
-       init_fpu();
+
 
 #if HIBERNATION
 #ifdef __i386__
 #endif
 }
 
+/*
+ * acpi_idle_kernel is called by the ACPI Platform kext to request the kernel
+ * to idle the boot processor in the deepest C-state for S0 sleep. All slave
+ * processors are expected already to have been offlined in the deepest C-state.
+ *
+ * The contract with ACPI is that although the kernel is called with interrupts
+ * disabled, interrupts may need to be re-enabled to dismiss any pending timer
+ * interrupt. However, the callback function will be called once this has
+ * occurred and interrupts are guaranteed to be disabled at that time,
+ * and to remain disabled during C-state entry, exit (wake) and return
+ * from acpi_idle_kernel.
+ *
+ * func:   callback invoked (with refcon) after any pending timer
+ *         interrupt has been dismissed and before the cpu is halted.
+ * refcon: opaque argument passed through to func.
+ */
+void
+acpi_idle_kernel(acpi_sleep_callback func, void *refcon)
+{
+       /* Interrupt state on entry; only reported in the kprintf below. */
+       boolean_t       istate = ml_get_interrupts_enabled();
+       
+       kprintf("acpi_idle_kernel, cpu=%d, interrupts %s\n",
+               cpu_number(), istate ? "enabled" : "disabled");
+
+       assert(cpu_number() == master_cpu);
+
+       /*
+        * Effectively set the boot cpu offline.
+        * This will stop further deadlines being set.
+        */
+       cpu_datap(master_cpu)->cpu_running = FALSE;
+
+       /*
+        * Cancel any pending deadline. While the local APIC still reports
+        * the timer vector as interrupting, briefly enable interrupts so
+        * the pending interrupt can be taken, then cancel again.
+        */
+       setPop(0);
+       while (lapic_is_interrupting(LAPIC_TIMER_VECTOR)) {
+               (void) ml_set_interrupts_enabled(TRUE);
+               setPop(0);
+               ml_set_interrupts_enabled(FALSE);
+       }
+
+       /*
+        * Call back to caller to indicate that interrupts will remain
+        * disabled while we deep idle, wake and return.
+        */ 
+       func(refcon);
+
+       /* Timestamp of idle entry; used below to rebase and to trace. */
+       acpi_idle_abstime = mach_absolute_time();
+
+       KERNEL_DEBUG_CONSTANT(
+               MACHDBG_CODE(DBG_MACH_SCHED, MACH_DEEP_IDLE) | DBG_FUNC_START,
+               acpi_idle_abstime, deep_idle_rebase, 0, 0, 0);
+
+       /*
+        * Disable tracing during S0-sleep
+        * unless overridden by sysctl -w machdep.tsc.deep_idle_rebase=0
+        * (in which case save_kdebug_enable is left untouched).
+        */
+       if (deep_idle_rebase) {
+               save_kdebug_enable = kdebug_enable;
+               kdebug_enable = 0;
+       }
+
+       /*
+        * Call into power-management to enter the lowest C-state.
+        * Note when called on the boot processor this routine will
+        * return directly when awoken.
+        */
+       pmCPUHalt(PM_HALT_SLEEP);
+
+       /*
+        * Get wakeup time relative to the TSC which has progressed.
+        * Then rebase nanotime to reflect time not progressing over sleep
+        * - unless overridden so that tracing can occur during deep_idle.
+        */ 
+       acpi_wake_abstime = mach_absolute_time();
+       if (deep_idle_rebase) {
+               rtc_sleep_wakeup(acpi_idle_abstime);
+               kdebug_enable = save_kdebug_enable;
+       }
+
+       /* Undo the "offline" marking applied before idling. */
+       cpu_datap(master_cpu)->cpu_running = TRUE;
+
+       KERNEL_DEBUG_CONSTANT(
+               MACHDBG_CODE(DBG_MACH_SCHED, MACH_DEEP_IDLE) | DBG_FUNC_END,
+               acpi_wake_abstime, acpi_wake_abstime - acpi_idle_abstime, 0, 0, 0);
+ 
+       /* Like S3 sleep, turn on tracing if trace_wake boot-arg is present */ 
+       if (kdebug_enable == 0) {
+               if (wake_nkdbufs)
+                       start_kern_tracing(wake_nkdbufs, TRUE);
+       }
+
+       IOCPURunPlatformActiveActions();
+
+       /* Restart timer interrupts */
+       rtc_timer_start();
+}
+
 extern char real_mode_bootstrap_end[];
 extern char real_mode_bootstrap_base[];
 
 
 typedef void (*acpi_sleep_callback)(void * refcon);
 extern vm_offset_t acpi_install_wake_handler(void);
 extern void       acpi_sleep_kernel(acpi_sleep_callback func, void * refcon);
+extern void       acpi_idle_kernel(acpi_sleep_callback func, void * refcon);
 void install_real_mode_bootstrap(void *prot_entry);
 #endif /* ASSEMBLER */
 
 
 
 char   *commPagePtr32 = NULL;          // virtual addr in kernel map of 32-bit commpage
 char   *commPagePtr64 = NULL;          // ...and of 64-bit commpage
-char   *commPageTextPtr32 = NULL;              // virtual addr in kernel map of 32-bit commpage
-char   *commPageTextPtr64 = NULL;              // ...and of 64-bit commpage
-uint32_t     _cpu_capabilities = 0;          // define the capability vector
+char   *commPageTextPtr32 = NULL;      // virtual addr in kernel map of 32-bit commpage
+char   *commPageTextPtr64 = NULL;      // ...and of 64-bit commpage
 
-int    noVMX = 0;              /* if true, do not set kHasAltivec in ppc _cpu_capabilities */
+uint64_t     _cpu_capabilities = 0;     // define the capability vector
 
 typedef uint32_t commpage_address_t;
 
-static commpage_address_t      next;                   // next available address in comm page
-static commpage_address_t      cur_routine;            // comm page address of "current" routine
-static boolean_t               matched;                // true if we've found a match for "current" routine
+static commpage_address_t      next;   // next available address in comm page
 
 static char    *commPagePtr;           // virtual addr in kernel map of commpage we are working on
 static commpage_address_t      commPageBaseOffset; // subtract from 32-bit runtime address to get offset in virtual commpage in kernel map
 static void
 commpage_init_cpu_capabilities( void )
 {
-       uint32_t bits;
+       uint64_t bits;
        int cpus;
        ml_cpu_info_t cpu_info;
 
        }
        cpus = commpage_cpus();                 // how many CPUs do we have
 
-       if (cpus == 1)
-               bits |= kUP;
-
        bits |= (cpus << kNumCPUsShift);
 
        bits |= kFastThreadLocalStorage;        // we use %gs for TLS
 
-       if (cpu_mode_is64bit())                 // k64Bit means processor is 64-bit capable
-               bits |= k64Bit;
-
-       if (tscFreq <= SLOW_TSC_THRESHOLD)      /* is TSC too slow for _commpage_nanotime?  */
-               bits |= kSlow;
-
-       bits |= (cpuid_features() & CPUID_FEATURE_AES) ? kHasAES : 0;
-
-       bits |= (cpuid_features() & CPUID_FEATURE_F16C) ? kHasF16C : 0;
-       bits |= (cpuid_features() & CPUID_FEATURE_RDRAND) ? kHasRDRAND : 0;
-       bits |= ((cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_ENFSTRG) &&
-                (rdmsr64(MSR_IA32_MISC_ENABLE) & 1ULL )) ? kHasENFSTRG : 0;
-
+#define setif(_bits, _bit, _condition) \
+       if (_condition) _bits |= _bit
+
+       setif(bits, kUP,         cpus == 1);
+       setif(bits, k64Bit,      cpu_mode_is64bit());
+       setif(bits, kSlow,       tscFreq <= SLOW_TSC_THRESHOLD);
+
+       setif(bits, kHasAES,     cpuid_features() &
+                                       CPUID_FEATURE_AES);
+       setif(bits, kHasF16C,    cpuid_features() &
+                                       CPUID_FEATURE_F16C);
+       setif(bits, kHasRDRAND,  cpuid_features() &
+                                       CPUID_FEATURE_RDRAND);
+       setif(bits, kHasFMA,     cpuid_features() &
+                                       CPUID_FEATURE_FMA);
+
+       setif(bits, kHasBMI1,    cpuid_leaf7_features() &
+                                       CPUID_LEAF7_FEATURE_BMI1);
+       setif(bits, kHasBMI2,    cpuid_leaf7_features() &
+                                       CPUID_LEAF7_FEATURE_BMI2);
+       setif(bits, kHasRTM,     cpuid_leaf7_features() &
+                                       CPUID_LEAF7_FEATURE_RTM);
+       setif(bits, kHasHLE,     cpuid_leaf7_features() &
+                                       CPUID_LEAF7_FEATURE_HLE);
+       setif(bits, kHasAVX2_0,  cpuid_leaf7_features() &
+                                       CPUID_LEAF7_FEATURE_AVX2);
+       
+       uint64_t misc_enable = rdmsr64(MSR_IA32_MISC_ENABLE);
+       setif(bits, kHasENFSTRG, (misc_enable & 1ULL) &&
+                                (cpuid_leaf7_features() &
+                                       CPUID_LEAF7_FEATURE_ENFSTRG));
+       
        _cpu_capabilities = bits;               // set kernel version for use by drivers etc
 }
 
-int
+uint64_t
 _get_cpu_capabilities(void)
 {
        return _cpu_capabilities;
  */
 static void
 commpage_stuff_routine(
-    commpage_descriptor        *rd     )
+    commpage_descriptor *rd     )
 {
-    uint32_t           must,cant;
-    
-    if (rd->commpage_address != cur_routine) {
-        if ((cur_routine!=0) && (matched==0))
-            panic("commpage no match for last, next address %08x", rd->commpage_address);
-        cur_routine = rd->commpage_address;
-        matched = 0;
-    }
-    
-    must = _cpu_capabilities & rd->musthave;
-    cant = _cpu_capabilities & rd->canthave;
-    
-    if ((must == rd->musthave) && (cant == 0)) {
-        if (matched)
-            panic("commpage multiple matches for address %08x", rd->commpage_address);
-        matched = 1;
-        
-        commpage_stuff(rd->commpage_address,rd->code_address,rd->code_length);
-       }
+       /*
+        * Copy the routine described by rd (code_address, code_length)
+        * to its commpage_address. Every descriptor is installed
+        * unconditionally: the former musthave/canthave capability
+        * matching, and the panics tied to it, no longer apply.
+        */
+       commpage_stuff(rd->commpage_address,rd->code_address,rd->code_length);
 }
 
 /* Fill in the 32- or 64-bit commpage.  Called once for each.
        const char*     signature,      // "commpage 32-bit" or "commpage 64-bit"
        vm_prot_t       uperm)
 {
-       uint8_t c1;
-       short   c2;
-       int         c4;
-       uint64_t c8;
+       uint8_t         c1;
+       uint16_t        c2;
+       int             c4;
+       uint64_t        c8;
        uint32_t        cfamily;
        short   version = _COMM_PAGE_THIS_VERSION;
 
        next = 0;
-       cur_routine = 0;
        commPagePtr = (char *)commpage_allocate( submap, (vm_size_t) area_used, uperm );
        *kernAddressPtr = commPagePtr;                          // save address either in commPagePtr32 or 64
        commPageBaseOffset = base_offset;
 
        /* Stuff in the constants.  We move things into the comm page in strictly
        * ascending order, so we can check for overlap and panic if so.
+       * Note: the 32-bit cpu_capabilities vector is retained in addition to
+       * the expanded 64-bit vector.
        */
-       commpage_stuff(_COMM_PAGE_SIGNATURE,signature,(int)strlen(signature));
+       commpage_stuff(_COMM_PAGE_SIGNATURE,signature,(int)MIN(_COMM_PAGE_SIGNATURELEN, strlen(signature)));
+       commpage_stuff(_COMM_PAGE_CPU_CAPABILITIES64,&_cpu_capabilities,sizeof(_cpu_capabilities));
        commpage_stuff(_COMM_PAGE_VERSION,&version,sizeof(short));
-       commpage_stuff(_COMM_PAGE_CPU_CAPABILITIES,&_cpu_capabilities,sizeof(int));
+       commpage_stuff(_COMM_PAGE_CPU_CAPABILITIES,&_cpu_capabilities,sizeof(uint32_t));
 
        c2 = 32;  // default
        if (_cpu_capabilities & kCache64)
        else if (_cpu_capabilities & kCache128)
                c2 = 128;
        commpage_stuff(_COMM_PAGE_CACHE_LINESIZE,&c2,2);
-       
+
        c4 = MP_SPIN_TRIES;
        commpage_stuff(_COMM_PAGE_SPIN_COUNT,&c4,4);
 
 void commpage_text_populate( void ){
        commpage_descriptor **rd;
        
-       next =0;
-       cur_routine=0;
+       next = 0;
        commPagePtr = (char *) commpage_allocate(commpage_text32_map, (vm_size_t) _COMM_PAGE_TEXT_AREA_USED, VM_PROT_READ | VM_PROT_EXECUTE);
        commPageTextPtr32 = commPagePtr;
        
        for (rd = commpage_32_routines; *rd != NULL; rd++) {
                commpage_stuff_routine(*rd);
        }
-       if (!matched)
-               panic(" commpage_text no match for last routine ");
 
 #ifndef __LP64__
        pmap_commpage32_init((vm_offset_t) commPageTextPtr32, _COMM_PAGE_TEXT_START, 
 #endif 
 
        if (_cpu_capabilities & k64Bit) {
-               next =0;
-               cur_routine=0;
+               next = 0;
                commPagePtr = (char *) commpage_allocate(commpage_text64_map, (vm_size_t) _COMM_PAGE_TEXT_AREA_USED, VM_PROT_READ | VM_PROT_EXECUTE);
                commPageTextPtr64 = commPagePtr;
 
 #endif 
        }
 
-       if (!matched)
-               panic(" commpage_text no match for last routine ");
-
        if (next > _COMM_PAGE_TEXT_END) 
                panic("commpage text overflow: next=0x%08x, commPagePtr=%p", next, commPagePtr); 
 
 }
 
-/* Update commpage nanotime information.  Note that we interleave
- * setting the 32- and 64-bit commpages, in order to keep nanotime more
- * nearly in sync between the two environments.
+/* Update commpage nanotime information.
  *
  * This routine must be serialized by some external means, ie a lock.
  */
                panic("nanotime trouble 1");    /* possibly not serialized */
        if ( ns_base < p32->nt_ns_base )
                panic("nanotime trouble 2");
-       if ((shift != 32) && ((_cpu_capabilities & kSlow)==0) )
+       if ((shift != 0) && ((_cpu_capabilities & kSlow)==0) )
                panic("nanotime trouble 3");
                
        next_gen = ++generation;
        cp = commPagePtr32;
        if ( cp ) {
                cp += (_COMM_PAGE_MEMORY_PRESSURE - _COMM_PAGE32_BASE_ADDRESS);
-               ip = (uint32_t*) cp;
+               ip = (uint32_t*) (void *) cp;
                *ip = (uint32_t) pressure;
        }
        
        cp = commPagePtr64;
        if ( cp ) {
                cp += (_COMM_PAGE_MEMORY_PRESSURE - _COMM_PAGE32_START_ADDRESS);
-               ip = (uint32_t*) cp;
+               ip = (uint32_t*) (void *) cp;
                *ip = (uint32_t) pressure;
        }
 
        cp = commPagePtr32;
        if ( cp ) {
                cp += (_COMM_PAGE_SPIN_COUNT - _COMM_PAGE32_BASE_ADDRESS);
-               ip = (uint32_t*) cp;
+               ip = (uint32_t*) (void *) cp;
                *ip = (uint32_t) count;
        }
        
        cp = commPagePtr64;
        if ( cp ) {
                cp += (_COMM_PAGE_SPIN_COUNT - _COMM_PAGE32_START_ADDRESS);
-               ip = (uint32_t*) cp;
+               ip = (uint32_t*) (void *) cp;
                *ip = (uint32_t) count;
        }
 
 
 .align alignment, 0x90                                         ;\
 L ## label ## :
 
+/*
+ * COMMPAGE_DESCRIPTOR(label, address): terminates the routine that starts
+ * at L<label>, computes its size, and emits one descriptor record into
+ * .const_data consisting of: pointer to the code, code length, and the
+ * commpage address it is to be placed at. Assembly then resumes in .text.
+ * The capability words (must/cant) are no longer part of the record.
+ */
-#define        COMMPAGE_DESCRIPTOR(label,address,must,cant)    \
+#define        COMMPAGE_DESCRIPTOR(label,address)                      \
 L ## label ## _end:                                            ;\
 .set L ## label ## _size, L ## label ## _end - L ## label      ;\
 .const_data                                                    ;\
     COMMPAGE_DESCRIPTOR_FIELD_POINTER  L ## label              ;\
     .long                              L ## label ## _size     ;\
     .long                              address                 ;\
-    .long                              must                    ;\
-    .long                              cant                    ;\
 .text
 
 
     void               *code_address;                          // address of code
     uint32_t           code_length;                            // length in bytes
     uint32_t           commpage_address;                       // put at this address (_COMM_PAGE_BCOPY etc)
-    uint32_t           musthave;                               // _cpu_capability bits we must have
-    uint32_t           canthave;                               // _cpu_capability bits we can't have
 } commpage_descriptor;
 
 
 
        movl    $(-58),%eax     /* 58 = pfz_exit */
        xorl    %ebx,%ebx       // clear "preemption pending" flag
        sysenter
-COMMPAGE_DESCRIPTOR(preempt,_COMM_PAGE_PREEMPT,0,0)
+COMMPAGE_DESCRIPTOR(preempt,_COMM_PAGE_PREEMPT)
 
 
 /* Subroutine to back off if we cannot get the spinlock.  Called
        cmpl    $0,8(%edi)      // sniff the lockword
        jnz     1b              // loop if still taken
        ret                     // lockword is free, so reenter PFZ
-COMMPAGE_DESCRIPTOR(backoff,_COMM_PAGE_BACKOFF,0,0)
+COMMPAGE_DESCRIPTOR(backoff,_COMM_PAGE_BACKOFF)
 
 
 /* Preemption-free-zone routine to FIFO Enqueue:
        movl        %esi,4(%edi)    // new element becomes last in q
        movl        $0,8(%edi)      // unlock spinlock
        ret
-COMMPAGE_DESCRIPTOR(pfz_enqueue,_COMM_PAGE_PFZ_ENQUEUE,0,0)
+COMMPAGE_DESCRIPTOR(pfz_enqueue,_COMM_PAGE_PFZ_ENQUEUE)
 
 
 /* Preemption-free-zone routine to FIFO Dequeue:
 4:
        movl        $0,8(%edi)      // unlock spinlock
        ret
-COMMPAGE_DESCRIPTOR(pfz_dequeue,_COMM_PAGE_PFZ_DEQUEUE,0,0)
+COMMPAGE_DESCRIPTOR(pfz_dequeue,_COMM_PAGE_PFZ_DEQUEUE)
 
 
 
        popq    %rcx
        popq    %rax
        ret
-COMMPAGE_DESCRIPTOR(preempt_64,_COMM_PAGE_PREEMPT,0,0)
+COMMPAGE_DESCRIPTOR(preempt_64,_COMM_PAGE_PREEMPT)
 
 
 /* Subroutine to back off if we cannot get the spinlock.  Called
        cmpl    $0,16(%rdi)     // sniff the lockword
        jnz     1b              // loop if still taken
        ret                     // lockword is free, so reenter PFZ
-COMMPAGE_DESCRIPTOR(backoff_64,_COMM_PAGE_BACKOFF,0,0)
+COMMPAGE_DESCRIPTOR(backoff_64,_COMM_PAGE_BACKOFF)
 
 
 /* Preemption-free-zone routine to FIFO Enqueue:
        movq        %rsi,8(%rdi)    // new element becomes last in q
        movl        $0,16(%rdi)     // unlock spinlock
        ret
-COMMPAGE_DESCRIPTOR(pfz_enqueue_64,_COMM_PAGE_PFZ_ENQUEUE,0,0)
+COMMPAGE_DESCRIPTOR(pfz_enqueue_64,_COMM_PAGE_PFZ_ENQUEUE)
 
 
 
 4:
        movl        $0,16(%rdi)     // unlock spinlock
        ret
-COMMPAGE_DESCRIPTOR(pfz_dequeue_64,_COMM_PAGE_PFZ_DEQUEUE,0,0)
+COMMPAGE_DESCRIPTOR(pfz_dequeue_64,_COMM_PAGE_PFZ_DEQUEUE)
 
        orl     $0x00180000,%eax            // copy 24 bytes of arguments in trampoline
        xorl    %ebx,%ebx                   // clear preemption flag
        sysenter
-COMMPAGE_DESCRIPTOR(pfz_mutex_lock,_COMM_PAGE_PFZ_MUTEX_LOCK,0,0)
+COMMPAGE_DESCRIPTOR(pfz_mutex_lock,_COMM_PAGE_PFZ_MUTEX_LOCK)
 
 
 
        movl    $PTHRW_STATUS_SYSCALL,%eax  // we made syscall
        popq    %rbp
        ret
-COMMPAGE_DESCRIPTOR(pfz_mutex_lock_64,_COMM_PAGE_PFZ_MUTEX_LOCK,0,0)
+COMMPAGE_DESCRIPTOR(pfz_mutex_lock_64,_COMM_PAGE_PFZ_MUTEX_LOCK)
 
 
 {
        cpu_data_t      *cdp = current_cpu_datap();
 
-       i386_deactivate_cpu();
-
        PE_cpu_machine_quiesce(cdp->cpu_id);
 
        cpu_thread_halt();
 
 #define        kSlow                           0x00004000      /* tsc < nanosecond */
 #define        kUP                             0x00008000      /* set if (kNumCPUs == 1) */
 #define        kNumCPUs                        0x00FF0000      /* number of CPUs (see _NumCPUs() below) */
+#define        kNumCPUsShift                   16
 #define        kHasAVX1_0                      0x01000000
 #define        kHasRDRAND                      0x02000000
 #define        kHasF16C                        0x04000000
 #define        kHasENFSTRG                     0x08000000
-#define        kNumCPUsShift                   16              /* see _NumCPUs() below */
+#define        kHasFMA                         0x10000000
+#define        kHasAVX2_0                      0x20000000
+#define        kHasBMI1                        0x40000000
+#define        kHasBMI2                        0x80000000
+/* Extending into 64-bits from here: */ 
+#define        kHasRTM                 0x0000000100000000ULL
+#define        kHasHLE                 0x0000000200000000ULL
+
 
 #ifndef        __ASSEMBLER__
 #include <sys/cdefs.h>
 
 __BEGIN_DECLS
-extern int  _get_cpu_capabilities( void );
+extern uint64_t  _get_cpu_capabilities( void );
 __END_DECLS
 
 inline static
 int _NumCPUs( void )
 {
-       return (_get_cpu_capabilities() & kNumCPUs) >> kNumCPUsShift;
+       return (int) (_get_cpu_capabilities() & kNumCPUs) >> kNumCPUsShift;
 }
 
 #endif /* __ASSEMBLER__ */
 
 /* data in the comm page */
  
-#define _COMM_PAGE_SIGNATURE           (_COMM_PAGE_START_ADDRESS+0x000)        /* first few bytes are a signature */
+#define _COMM_PAGE_SIGNATURE           (_COMM_PAGE_START_ADDRESS+0x000)        /* first 16 bytes are a signature */
+#define _COMM_PAGE_SIGNATURELEN                (0x10)
+#define _COMM_PAGE_CPU_CAPABILITIES64  (_COMM_PAGE_START_ADDRESS+0x010)        /* uint64_t _cpu_capabilities */
+#define _COMM_PAGE_UNUSED              (_COMM_PAGE_START_ADDRESS+0x018)        /* 6 unused bytes */
 #define _COMM_PAGE_VERSION             (_COMM_PAGE_START_ADDRESS+0x01E)        /* 16-bit version# */
-#define _COMM_PAGE_THIS_VERSION                12                                      /* version of the commarea format */
+#define _COMM_PAGE_THIS_VERSION                13                                      /* in ver 13, _COMM_PAGE_NT_SHIFT defaults to 0 (was 32) */
   
-#define _COMM_PAGE_CPU_CAPABILITIES    (_COMM_PAGE_START_ADDRESS+0x020)        /* uint32_t _cpu_capabilities */
+#define _COMM_PAGE_CPU_CAPABILITIES    (_COMM_PAGE_START_ADDRESS+0x020)        /* uint32_t _cpu_capabilities (retained for compatibility) */
 #define _COMM_PAGE_NCPUS               (_COMM_PAGE_START_ADDRESS+0x022)        /* uint8_t number of configured CPUs (hw.logicalcpu at boot time) */
-#define _COMM_PAGE_UNUSED0                     (_COMM_PAGE_START_ADDRESS+0x024)        /* 2 unused bytes, reserved for future expansion of cpu_capabilities */
+#define _COMM_PAGE_UNUSED0             (_COMM_PAGE_START_ADDRESS+0x024)        /* 2 unused bytes, previously reserved for expansion of cpu_capabilities */
 #define _COMM_PAGE_CACHE_LINESIZE      (_COMM_PAGE_START_ADDRESS+0x026)        /* uint16_t cache line size */
 
 #define _COMM_PAGE_SCHED_GEN           (_COMM_PAGE_START_ADDRESS+0x028)        /* uint32_t scheduler generation number (count of pre-emptions) */
 
 
 typedef        uint16_t        pcid_t;
 typedef        uint8_t         pcid_ref_t;
+
+#define CPU_RTIME_BINS (12)
+#define CPU_ITIME_BINS (CPU_RTIME_BINS)
+
 /*
  * Per-cpu data.
  *
        int                     cpu_prior_signals;      /* Last set of events,
                                                         * debugging
                                                         */
-       int                     cpu_mcount_off;         /* mcount recursion */
        ast_t                   cpu_pending_ast;
-       int                     cpu_type;
-       int                     cpu_subtype;
-       int                     cpu_threadtype;
-       int                     cpu_running;
+       volatile int            cpu_running;
+       boolean_t               cpu_fixed_pmcs_enabled;
        rtclock_timer_t         rtclock_timer;
        boolean_t               cpu_is64bit;
        volatile addr64_t       cpu_active_cr3 __attribute((aligned(64)));
        struct fake_descriptor  *cpu_ldtp;
        cpu_desc_index_t        cpu_desc_index;
        int                     cpu_ldt;
-       boolean_t               cpu_iflag;
-       boolean_t               cpu_boot_complete;
-       int                     cpu_hibernate;
 #if NCOPY_WINDOWS > 0
        vm_offset_t             cpu_copywindow_base;
        uint64_t                *cpu_copywindow_pdp;
        vm_offset_t             cpu_physwindow_base;
        uint64_t                *cpu_physwindow_ptep;
 #endif
-       void                    *cpu_hi_iss;
 
 #define HWINTCNT_SIZE 256
        uint32_t                cpu_hwIntCnt[HWINTCNT_SIZE];    /* Interrupt counts */
+       uint64_t                cpu_hwIntpexits[HWINTCNT_SIZE];
+       uint64_t                cpu_hwIntcexits[HWINTCNT_SIZE];
        uint64_t                cpu_dr7; /* debug control register */
        uint64_t                cpu_int_event_time;     /* intr entry/exit time */
-#if CONFIG_VMX
-       vmx_cpu_t               cpu_vmx;                /* wonderful world of virtualization */
-#endif
-#if CONFIG_MCA
-       struct mca_state        *cpu_mca_state;         /* State at MC fault */
-#endif
        uint64_t                cpu_uber_arg_store;     /* Double mapped address
                                                         * of current thread's
                                                         * uu_arg array.
        uint64_t                cpu_c7res;
        uint64_t                cpu_itime_total;
        uint64_t                cpu_rtime_total;
-       uint64_t                cpu_rtimes[4];
-       uint64_t                cpu_itimes[4];
        uint64_t                cpu_ixtime;
+       uint64_t                cpu_idle_exits;
+       uint64_t                cpu_rtimes[CPU_RTIME_BINS];
+       uint64_t                cpu_itimes[CPU_ITIME_BINS];
+       uint64_t                cpu_cur_insns;
+       uint64_t                cpu_cur_ucc;
+       uint64_t                cpu_cur_urc;
        uint64_t                cpu_max_observed_int_latency;
        int                     cpu_max_observed_int_latency_vector;
        uint64_t                debugger_entry_time;
+       uint64_t                debugger_ipi_time;
        volatile boolean_t      cpu_NMI_acknowledged;
        /* A separate nested interrupt stack flag, to account
         * for non-nested interrupts arriving while on the interrupt stack
        uint32_t                cpu_nested_istack_events;
        x86_saved_state64_t     *cpu_fatal_trap_state;
        x86_saved_state64_t     *cpu_post_fatal_trap_state;
+#if CONFIG_VMX
+       vmx_cpu_t               cpu_vmx;                /* wonderful world of virtualization */
+#endif
+#if CONFIG_MCA
+       struct mca_state        *cpu_mca_state;         /* State at MC fault */
+#endif
+       int                     cpu_type;
+       int                     cpu_subtype;
+       int                     cpu_threadtype;
+       boolean_t               cpu_iflag;
+       boolean_t               cpu_boot_complete;
+       int                     cpu_hibernate;
 } cpu_data_t;
 
 extern cpu_data_t      *cpu_data_ptr[];  
 
     void               *pmStats;       /* Power Management stats for package*/
     void               *pmState;       /* Power Management state for package*/
     struct mca_state   *mca_state;     /* MCA state for memory errors */
+    uint64_t           package_idle_exits;
     uint32_t           num_idle;
 } x86_pkg_t;
 
 
        { 0x70, CACHE,  TRACE,          8,      12*K,   NA  },
        { 0x71, CACHE,  TRACE,          8,      16*K,   NA  },
        { 0x72, CACHE,  TRACE,          8,      32*K,   NA  },
+       { 0x76, TLB,    INST,           NA,     BOTH,   8   },
        { 0x78, CACHE,  L2,             4,      1*M,    64  },
        { 0x79, CACHE,  L2_2LINESECTOR, 8,      128*K,  64  },
        { 0x7A, CACHE,  L2_2LINESECTOR, 8,      256*K,  64  },
        { 0xB2, TLB,    INST,           4,      SMALL,  64  },  
        { 0xB3, TLB,    DATA,           4,      SMALL,  128 },  
        { 0xB4, TLB,    DATA1,          4,      SMALL,  256 },  
+       { 0xB5, TLB,    DATA1,          8,      SMALL,  64  },  
+       { 0xB6, TLB,    DATA1,          8,      SMALL,  128 },  
        { 0xBA, TLB,    DATA1,          4,      BOTH,   64  },  
-       { 0xCA, STLB,   DATA1,          4,      BOTH,   512 },  
+       { 0xC1, STLB,   DATA1,          8,      SMALL,  1024},  
+       { 0xCA, STLB,   DATA1,          4,      SMALL,  512 },  
        { 0xD0, CACHE,  L3,             4,      512*K,  64  },  
        { 0xD1, CACHE,  L3,             4,      1*M,    64  },  
        { 0xD2, CACHE,  L3,             4,      2*M,    64  },  
                ctp->sensor               = bitfield32(reg[eax], 0, 0);
                ctp->dynamic_acceleration = bitfield32(reg[eax], 1, 1);
                ctp->invariant_APIC_timer = bitfield32(reg[eax], 2, 2);
-               ctp->core_power_limits    = bitfield32(reg[eax], 3, 3);
-               ctp->fine_grain_clock_mod = bitfield32(reg[eax], 4, 4);
-               ctp->package_thermal_intr = bitfield32(reg[eax], 5, 5);
+               ctp->core_power_limits    = bitfield32(reg[eax], 4, 4);
+               ctp->fine_grain_clock_mod = bitfield32(reg[eax], 5, 5);
+               ctp->package_thermal_intr = bitfield32(reg[eax], 6, 6);
                ctp->thresholds           = bitfield32(reg[ebx], 3, 0);
                ctp->ACNT_MCNT            = bitfield32(reg[ecx], 0, 0);
                ctp->hardware_feedback    = bitfield32(reg[ecx], 1, 1);
-               ctp->energy_policy        = bitfield32(reg[ecx], 2, 2);
+               ctp->energy_policy        = bitfield32(reg[ecx], 3, 3);
                info_p->cpuid_thermal_leafp = ctp;
 
                DBG(" Thermal/Power Leaf:\n");
                DBG("  package_thermal_intr : %d\n", ctp->package_thermal_intr);
                DBG("  thresholds           : %d\n", ctp->thresholds);
                DBG("  ACNT_MCNT            : %d\n", ctp->ACNT_MCNT);
-               DBG("  hardware_feedback    : %d\n", ctp->hardware_feedback);
+               DBG("  ACNT2                : %d\n", ctp->hardware_feedback);
                DBG("  energy_policy        : %d\n", ctp->energy_policy);
        }
 
                DBG("  EDX           : 0x%x\n", xsp->extended_state[edx]);
        }
 
-       if (info_p->cpuid_model == CPUID_MODEL_IVYBRIDGE) {
+       if (info_p->cpuid_model >= CPUID_MODEL_IVYBRIDGE) {
                /*
-                * XSAVE Features:
+                * Leaf7 Features:
                 */
                cpuid_fn(0x7, reg);
                info_p->cpuid_leaf7_features = reg[ebx];
                case CPUID_MODEL_IVYBRIDGE:
                        cpufamily = CPUFAMILY_INTEL_IVYBRIDGE;
                        break;
+               case CPUID_MODEL_HASWELL:
+               case CPUID_MODEL_HASWELL_ULT:
+               case CPUID_MODEL_CRYSTALWELL:
+                       cpufamily = CPUFAMILY_INTEL_HASWELL;
+                       break;
                }
                break;
        }
                info_p->thread_count = bitfield32((uint32_t)msr, 15,  0);
                break;
                }
+       case CPUFAMILY_INTEL_HASWELL:
        case CPUFAMILY_INTEL_IVYBRIDGE:
        case CPUFAMILY_INTEL_SANDYBRIDGE:
        case CPUFAMILY_INTEL_NEHALEM: {
        {CPUID_FEATURE_TM2,       "TM2"},
        {CPUID_FEATURE_SSSE3,     "SSSE3"},
        {CPUID_FEATURE_CID,       "CID"},
+       {CPUID_FEATURE_FMA,       "FMA"},
        {CPUID_FEATURE_CX16,      "CX16"},
        {CPUID_FEATURE_xTPR,      "TPR"},
        {CPUID_FEATURE_PDCM,      "PDCM"},
        {CPUID_FEATURE_SSE4_1,    "SSE4.1"},
        {CPUID_FEATURE_SSE4_2,    "SSE4.2"},
-       {CPUID_FEATURE_xAPIC,     "xAPIC"},
+       {CPUID_FEATURE_x2APIC,    "x2APIC"},
        {CPUID_FEATURE_MOVBE,     "MOVBE"},
        {CPUID_FEATURE_POPCNT,    "POPCNT"},
        {CPUID_FEATURE_AES,       "AES"},
 },
 leaf7_feature_map[] = {
        {CPUID_LEAF7_FEATURE_RDWRFSGS, "RDWRFSGS"},
+       {CPUID_LEAF7_FEATURE_TSCOFF,   "TSC_THREAD_OFFSET"},
+       {CPUID_LEAF7_FEATURE_BMI1,     "BMI1"},
+       {CPUID_LEAF7_FEATURE_HLE,      "HLE"},
        {CPUID_LEAF7_FEATURE_SMEP,     "SMEP"},
+       {CPUID_LEAF7_FEATURE_AVX2,     "AVX2"},
+       {CPUID_LEAF7_FEATURE_BMI2,     "BMI2"},
        {CPUID_LEAF7_FEATURE_ENFSTRG,  "ENFSTRG"},
+       {CPUID_LEAF7_FEATURE_INVPCID,  "INVPCID"},
+       {CPUID_LEAF7_FEATURE_RTM,      "RTM"},
        {0, 0}
 };
 
 
 #define CPUID_FEATURE_SSSE3     _HBit(9)  /* Supplemental SSE3 instructions */
 #define CPUID_FEATURE_CID       _HBit(10) /* L1 Context ID */
 #define CPUID_FEATURE_SEGLIM64  _HBit(11) /* 64-bit segment limit checking */
+#define CPUID_FEATURE_FMA       _HBit(12) /* Fused-Multiply-Add support */
 #define CPUID_FEATURE_CX16      _HBit(13) /* CmpXchg16b instruction */
 #define CPUID_FEATURE_xTPR      _HBit(14) /* Send Task PRiority msgs */
 #define CPUID_FEATURE_PDCM      _HBit(15) /* Perf/Debug Capability MSR */
 #define CPUID_FEATURE_DCA       _HBit(18) /* Direct Cache Access */
 #define CPUID_FEATURE_SSE4_1    _HBit(19) /* Streaming SIMD extensions 4.1 */
 #define CPUID_FEATURE_SSE4_2    _HBit(20) /* Streaming SIMD extensions 4.2 */
-#define CPUID_FEATURE_xAPIC     _HBit(21) /* Extended APIC Mode */
+#define CPUID_FEATURE_x2APIC    _HBit(21) /* Extended APIC Mode */
 #define CPUID_FEATURE_MOVBE     _HBit(22) /* MOVBE instruction */
 #define CPUID_FEATURE_POPCNT    _HBit(23) /* POPCNT instruction */
 #define CPUID_FEATURE_TSCTMR    _HBit(24) /* TSC deadline timer */
  * Bits returned in %ebx to a CPUID request with {%eax,%ecx} of (0x7,0x0}:
  */
 #define CPUID_LEAF7_FEATURE_RDWRFSGS _Bit(0)   /* FS/GS base read/write */
+#define CPUID_LEAF7_FEATURE_TSCOFF   _Bit(1)   /* TSC thread offset */
+#define CPUID_LEAF7_FEATURE_BMI1     _Bit(3)   /* Bit Manipulation Instrs, set 1 */
+#define CPUID_LEAF7_FEATURE_HLE      _Bit(4)   /* Hardware Lock Elision*/
+#define CPUID_LEAF7_FEATURE_AVX2     _Bit(5)   /* AVX2 Instructions */
 #define CPUID_LEAF7_FEATURE_SMEP     _Bit(7)   /* Supervisor Mode Execute Protect */
+#define CPUID_LEAF7_FEATURE_BMI2     _Bit(8)   /* Bit Manipulation Instrs, set 2 */
 #define CPUID_LEAF7_FEATURE_ENFSTRG  _Bit(9)   /* ENhanced Fast STRinG copy */
+#define CPUID_LEAF7_FEATURE_INVPCID  _Bit(10)  /* INVPCID instruction */
+#define CPUID_LEAF7_FEATURE_RTM      _Bit(11)  /* Restricted Transactional Memory */
 
 /*
  * The CPUID_EXTFEATURE_XXX values define 64-bit values
 #define CPUID_MODEL_SANDYBRIDGE        0x2A
 #define CPUID_MODEL_JAKETOWN   0x2D
 #define CPUID_MODEL_IVYBRIDGE  0x3A
+#define CPUID_MODEL_HASWELL    0x3C
+#define CPUID_MODEL_HASWELL_SVR        0x3F
+#define CPUID_MODEL_HASWELL_ULT        0x45
+#define CPUID_MODEL_CRYSTALWELL        0x46
 
 
 #define CPUID_VMM_FAMILY_UNKNOWN       0x0
 
 void
 etimer_resync_deadlines(void)
 {
-       uint64_t                deadline;
+       uint64_t                deadline = EndOfAllTime;
        uint64_t                pmdeadline;
        rtclock_timer_t         *mytimer;
        spl_t                   s = splclock();
        uint32_t                decr;
 
        pp = current_cpu_datap();
-       deadline = EndOfAllTime;
+       if (!pp->cpu_running)
+               /* There's really nothing to do if this processor is down */
+               return;
 
        /*
         * If we have a clock timer set, pick that.
 
        if (fp_kind == FP_NO)
            return KERN_FAILURE;
 
+       if ((f == x86_AVX_STATE32 || f == x86_AVX_STATE64) &&
+           !ml_fpu_avx_enabled())
+           return KERN_FAILURE;
+
        state = (x86_float_state64_t *)tstate;
 
        assert(thr_act != THREAD_NULL);
        if (fp_kind == FP_NO)
                return KERN_FAILURE;
 
+       if ((f == x86_AVX_STATE32 || f == x86_AVX_STATE64) &&
+           !ml_fpu_avx_enabled())
+               return KERN_FAILURE;
+
        state = (x86_float_state64_t *)tstate;
 
        assert(thr_act != THREAD_NULL);
 
                offsetof(cpu_data_t *,cpu_number));
         DECLARE("CPU_RUNNING",
                offsetof(cpu_data_t *,cpu_running));
-        DECLARE("CPU_MCOUNT_OFF",
-               offsetof(cpu_data_t *,cpu_mcount_off));
        DECLARE("CPU_PENDING_AST",
                offsetof(cpu_data_t *,cpu_pending_ast));
        DECLARE("CPU_DESC_TABLEP",
 
     index = (virt >> I386_LPGSHIFT);
     virt += (uintptr_t)(phys & I386_LPGMASK);
     phys  = ((phys & ~((uint64_t)I386_LPGMASK)) | INTEL_PTE_PS  | INTEL_PTE_VALID | INTEL_PTE_WRITE);
+    if (phys == BootPTD[index]) return (virt);
     BootPTD[index] = phys;
     invlpg(virt);
     BootPTD[index + 1] = (phys + I386_LPGBYTES);
 
 
        tsc_init();
        power_management_init();
-
        processor_bootstrap();
        thread_bootstrap();
 
                mca_cpu_init();
 #endif
   
+               LAPIC_INIT();
                lapic_configure();
                LAPIC_DUMP();
                LAPIC_CPU_MAP_DUMP();
 #if CONFIG_MTRR
                mtrr_update_cpu();
 #endif
+               /* update CPU microcode */
+               ucode_update_wake();
        } else
            init_param = FAST_SLAVE_INIT;
 
-       /* update CPU microcode */
-       ucode_update_wake();
-
 #if CONFIG_VMX
        /* resume VT operation */
        vmx_resume();
 
 /* Base vector for local APIC interrupt sources */
 int lapic_interrupt_base = LAPIC_DEFAULT_INTERRUPT_BASE;
 
-#define                MAX_LAPICIDS    (LAPIC_ID_MAX+1)
 int            lapic_to_cpu[MAX_LAPICIDS];
 int            cpu_to_lapic[MAX_CPUS];
 
 
 #define        LAPIC_MSR(reg)          (LAPIC_MSR_BASE + LAPIC_MSR_OFFSET(reg))
 
 typedef struct {
-       void            (*init) (void);
-       uint32_t        (*read) (lapic_register_t);
-       void            (*write)(lapic_register_t, uint32_t);
+       void            (*init)         (void);
+       uint32_t        (*read)         (lapic_register_t);
+       void            (*write)        (lapic_register_t, uint32_t);
+       uint64_t        (*read_icr)     (void);
+       void            (*write_icr)    (uint32_t, uint32_t);
 } lapic_ops_table_t;
 extern  lapic_ops_table_t *lapic_ops;
 
+/* Run the init hook of the active lapic_ops table; the caller supplies the ';'. */
+#define LAPIC_INIT()                   lapic_ops->init()
 #define LAPIC_WRITE(reg,val)           lapic_ops->write(reg, val)
 #define LAPIC_READ(reg)                        lapic_ops->read(reg)
 #define LAPIC_READ_OFFSET(reg,off)     LAPIC_READ((reg)+(off))
+#define LAPIC_READ_ICR()               lapic_ops->read_icr()
+#define LAPIC_WRITE_ICR(dst,cmd)       lapic_ops->write_icr(dst, cmd)
 
 typedef enum {
        periodic,
 #define LAPIC_PM_INTERRUPT             0x7
 
 #define LAPIC_PMC_SWI_VECTOR           (LAPIC_DEFAULT_INTERRUPT_BASE + LAPIC_PMC_SW_INTERRUPT)
+#define LAPIC_TIMER_VECTOR             (LAPIC_DEFAULT_INTERRUPT_BASE + LAPIC_TIMER_INTERRUPT)
 
 /* The vector field is ignored for NMI interrupts via the LAPIC
  * or otherwise, so this is not an offset from the interrupt
 extern void            lapic_interrupt_counts(uint64_t intrs[256]);
 extern void            lapic_disable_timer(void);
 
+#define        MAX_LAPICIDS    (LAPIC_ID_MAX+1)
 #ifdef MP_DEBUG
-extern void            lapic_cpu_map_dump(void);
 #define LAPIC_CPU_MAP_DUMP()   lapic_cpu_map_dump()
 #define LAPIC_DUMP()           lapic_dump()
 #else
 
        vm_map_offset_t lapic_vbase64;
        /* Establish a map to the local apic */
 
-       lapic_vbase64 = (vm_offset_t)vm_map_min(kernel_map);
-       result = vm_map_find_space(kernel_map,
-                                  &lapic_vbase64,
-                                  round_page(LAPIC_SIZE), 0,
-                                  VM_MAKE_TAG(VM_MEMORY_IOKIT), &entry);
-       /* Convert 64-bit vm_map_offset_t to "pointer sized" vm_offset_t
-        */
-       lapic_vbase = (vm_offset_t) lapic_vbase64;
-       if (result != KERN_SUCCESS) {
-               panic("legacy_init: vm_map_find_entry FAILED (err=%d)", result);
+       if (lapic_vbase == 0) {
+               lapic_vbase64 = (vm_offset_t)vm_map_min(kernel_map);
+               result = vm_map_find_space(kernel_map,
+                                          &lapic_vbase64,
+                                          round_page(LAPIC_SIZE), 0,
+                                          VM_MAKE_TAG(VM_MEMORY_IOKIT), &entry);
+               /* Convert 64-bit vm_map_offset_t to "pointer sized" vm_offset_t
+                */
+               lapic_vbase = (vm_offset_t) lapic_vbase64;
+               if (result != KERN_SUCCESS) {
+                       panic("legacy_init: vm_map_find_entry FAILED (err=%d)", result);
+               }
+               vm_map_unlock(kernel_map);
+
+               /*
+                * Map in the local APIC non-cacheable, as recommended by Intel
+                * in section 8.4.1 of the "System Programming Guide".
+                * In fact, this is redundant because EFI will have assigned an
+                * MTRR physical range containing the local APIC's MMIO space as
+                * UC and this will override the default PAT setting.
+                */
+               pmap_enter(pmap_kernel(),
+                               lapic_vbase,
+                               (ppnum_t) i386_btop(lapic_pbase),
+                               VM_PROT_READ|VM_PROT_WRITE,
+                               VM_PROT_NONE,
+                               VM_WIMG_IO,
+                               TRUE);
        }
-       vm_map_unlock(kernel_map);
 
        /*
-        * Map in the local APIC non-cacheable, as recommended by Intel
-        * in section 8.4.1 of the "System Programming Guide".
-        * In fact, this is redundant because EFI will have assigned an
-        * MTRR physical range containing the local APIC's MMIO space as
-        * UC and this will override the default PAT setting.
+        * Set flat delivery model, logical processor id
+        * This should already be the default set.
         */
-       pmap_enter(pmap_kernel(),
-                       lapic_vbase,
-                       (ppnum_t) i386_btop(lapic_pbase),
-                       VM_PROT_READ|VM_PROT_WRITE,
-                       VM_PROT_NONE,
-                       VM_WIMG_IO,
-                       TRUE);
+       LAPIC_WRITE(DFR, LAPIC_DFR_FLAT);
+       LAPIC_WRITE(LDR, (get_cpu_number()) << LAPIC_LDR_SHIFT);
 }
 
 
        *LAPIC_MMIO(reg) = value;
 }
 
+/*
+ * Read the interrupt command register through the memory-mapped (legacy)
+ * interface.  The two 32-bit words are merged into one 64-bit value:
+ * ICRD supplies bits 63..32 and ICR supplies bits 31..0.
+ */
+static uint64_t
+legacy_read_icr(void)
+{
+       uint64_t        icr_hi = (uint64_t)*LAPIC_MMIO(ICRD);
+       uint64_t        icr_lo = (uint64_t)*LAPIC_MMIO(ICR);
+
+       return (icr_hi << 32) | icr_lo;
+}
+
+/*
+ * Issue an interrupt command through the memory-mapped (legacy) interface.
+ *   dst: destination LAPIC id; shifted by LAPIC_ICRD_DEST_SHIFT before
+ *        being stored in ICRD
+ *   cmd: value stored unchanged in ICR
+ * The destination word is stored first and the command word second.
+ * NOTE(review): per the Intel SDM the write of the low ICR word is what
+ * dispatches the IPI, so this order should be kept.
+ */
+static void
+legacy_write_icr(uint32_t dst, uint32_t cmd)
+{
+       uint32_t        dest_word = dst << LAPIC_ICRD_DEST_SHIFT;
+
+       *LAPIC_MMIO(ICRD) = dest_word;
+       *LAPIC_MMIO(ICR) = cmd;
+}
+
 static lapic_ops_table_t legacy_ops = {
        legacy_init,
        legacy_read,
-       legacy_write
+       legacy_write,
+       legacy_read_icr,
+       legacy_write_icr
 };
 
+static boolean_t is_x2apic = FALSE;
+
 static void
 x2apic_init(void)
 {
+       uint32_t        lo;
+       uint32_t        hi;
+
+       rdmsr(MSR_IA32_APIC_BASE, lo, hi);
+       if ((lo & MSR_IA32_APIC_BASE_EXTENDED) == 0)  {
+               lo |= MSR_IA32_APIC_BASE_EXTENDED;
+               wrmsr(MSR_IA32_APIC_BASE, lo, hi);
+               kprintf("x2APIC mode enabled\n");
+       }
 }
 
 static uint32_t
        wrmsr(LAPIC_MSR(reg), value, 0);
 }
 
+/*
+ * Read the interrupt command register through the x2APIC interface:
+ * a single rdmsr64() of LAPIC_MSR(ICR) yields the whole 64-bit value.
+ */
+static uint64_t
+x2apic_read_icr(void)
+{
+       return rdmsr64(LAPIC_MSR(ICR));
+}
+
+/*
+ * Issue an interrupt command through the x2APIC interface with one wrmsr
+ * to LAPIC_MSR(ICR): cmd is passed as the low word and dst, unshifted,
+ * as the high word.
+ */
+static void
+x2apic_write_icr(uint32_t dst, uint32_t cmd)
+{
+       wrmsr(LAPIC_MSR(ICR), cmd, dst);
+}
+
 static lapic_ops_table_t x2apic_ops = {
        x2apic_init,
        x2apic_read,
-       x2apic_write
+       x2apic_write,
+       x2apic_read_icr,
+       x2apic_write_icr
 };
 
-
 void
 lapic_init(void)
 {
        uint32_t        hi;
        boolean_t       is_boot_processor;
        boolean_t       is_lapic_enabled;
-       boolean_t       is_x2apic;
 
        /* Examine the local APIC state */
        rdmsr(MSR_IA32_APIC_BASE, lo, hi);
        if (!is_boot_processor || !is_lapic_enabled)
                panic("Unexpected local APIC state\n");
 
+       /*
+        * If x2APIC is available and not already enabled, enable it
+        * only when the "-x2apic" boot-arg is given.
+        */
+       if (!is_x2apic && (cpuid_features() & CPUID_FEATURE_x2APIC)) {
+               PE_parse_boot_argn("-x2apic", &is_x2apic, sizeof(is_x2apic));
+               kprintf("x2APIC supported %s be enabled\n",
+                       is_x2apic ? "and will" : "but will not");
+       }
+
        lapic_ops = is_x2apic ? &x2apic_ops : &legacy_ops;
 
-       lapic_ops->init();
+       LAPIC_INIT();
 
+       kprintf("ID: 0x%x LDR: 0x%x\n", LAPIC_READ(ID), LAPIC_READ(LDR));
        if ((LAPIC_READ(VERSION)&LAPIC_VERSION_MASK) < 0x14) {
                panic("Local APIC version 0x%x, 0x14 or more expected\n",
                        (LAPIC_READ(VERSION)&LAPIC_VERSION_MASK));
                LAPIC_READ(APR)&LAPIC_APR_MASK,
                LAPIC_READ(PPR)&LAPIC_PPR_MASK);
        kprintf("Destination Format 0x%x Logical Destination 0x%x\n",
-               LAPIC_READ(DFR)>>LAPIC_DFR_SHIFT,
+               is_x2apic ? 0 : LAPIC_READ(DFR)>>LAPIC_DFR_SHIFT,
                LAPIC_READ(LDR)>>LAPIC_LDR_SHIFT);
        kprintf("%cEnabled %cFocusChecking SV 0x%x\n",
                BOOL(LAPIC_READ(SVR)&LAPIC_SVR_ENABLE),
                }
        }
 
-       /* Set flat delivery model, logical processor id */
-       LAPIC_WRITE(DFR, LAPIC_DFR_FLAT);
-       LAPIC_WRITE(LDR, (get_cpu_number()) << LAPIC_LDR_SHIFT);
-
        /* Accept all */
        LAPIC_WRITE(TPR, 0);
 
        state = ml_set_interrupts_enabled(FALSE);
 
        /* Wait for pending outgoing send to complete */
-       while (LAPIC_READ(ICR) & LAPIC_ICR_DS_PENDING) {
+       while (LAPIC_READ_ICR() & LAPIC_ICR_DS_PENDING) {
                cpu_pause();
        }
 
-       LAPIC_WRITE(ICRD, cpu_to_lapic[cpu] << LAPIC_ICRD_DEST_SHIFT);
-       LAPIC_WRITE(ICR, vector | LAPIC_ICR_DM_FIXED);
+       LAPIC_WRITE_ICR(cpu_to_lapic[cpu], vector | LAPIC_ICR_DM_FIXED);
 
        (void) ml_set_interrupts_enabled(state);
 }
 
 mca_get_availability(void)
 {
        uint64_t        features = cpuid_info()->cpuid_features;
-       uint32_t        family =  cpuid_info()->cpuid_family;
+       uint32_t        family =   cpuid_info()->cpuid_family;
+       uint32_t        model =    cpuid_info()->cpuid_model;
+       uint32_t        stepping = cpuid_info()->cpuid_stepping;
 
        mca_MCE_present = (features & CPUID_FEATURE_MCE) != 0;
        mca_MCA_present = (features & CPUID_FEATURE_MCA) != 0;
        mca_family = family;
-       
+
+       if ((model == CPUID_MODEL_HASWELL     && stepping < 3) ||
+           (model == CPUID_MODEL_HASWELL_ULT && stepping < 1) ||
+           (model == CPUID_MODEL_CRYSTALWELL && stepping < 1))
+               panic("Haswell pre-C0 steppings are not supported");
+
        /*
         * If MCA, the number of banks etc is reported by the IA32_MCG_CAP MSR.
         */
 
  * instead of spinning for clock_delay_until().
  */
 void
-ml_init_delay_spin_threshold(void)
+ml_init_delay_spin_threshold(int threshold_us)
 {
-       nanoseconds_to_absolutetime(10ULL * NSEC_PER_USEC, &delay_spin_threshold);
+       nanoseconds_to_absolutetime(threshold_us * NSEC_PER_USEC, &delay_spin_threshold);
 }
 
 boolean_t
 }
 
 /*
- * This is called from the machine-independent routine cpu_up()
+ * This is called from the machine-independent layer
  * to perform machine-dependent info updates. Defer to cpu_thread_init().
  */
 void
 }
 
 /*
- * This is called from the machine-independent routine cpu_down()
+ * This is called from the machine-independent layer
  * to perform machine-dependent info updates.
  */
 void
 ml_cpu_down(void)
 {
+       i386_deactivate_cpu();  /* the only machine-dependent work done here is delegated to this call */
+
        return;
 }
 
 
 
 void ml_get_timebase(unsigned long long *timestamp);
 void ml_init_lock_timeout(void); 
-void ml_init_delay_spin_threshold(void);
+void ml_init_delay_spin_threshold(int);
 
 boolean_t ml_delay_should_spin(uint64_t interval);
 
 
 #define FAST_SLAVE_INIT        ((void *)(uintptr_t)1)
 
 uint64_t ml_early_random(void);
+void cpu_pmc_control(void *);
 #endif /* _I386_MISC_PROTOS_H_ */
 
 #define        TRACE_MP_CPUS_CALL_LOCAL        MACHDBG_CODE(DBG_MACH_MP, 2)
 #define        TRACE_MP_CPUS_CALL_ACTION       MACHDBG_CODE(DBG_MACH_MP, 3)
 #define        TRACE_MP_CPUS_CALL_NOBUF        MACHDBG_CODE(DBG_MACH_MP, 4)
+#define        TRACE_MP_CPU_FAST_START         MACHDBG_CODE(DBG_MACH_MP, 5)
+#define        TRACE_MP_CPU_START              MACHDBG_CODE(DBG_MACH_MP, 6)
+#define        TRACE_MP_CPU_DEACTIVATE         MACHDBG_CODE(DBG_MACH_MP, 7)
 
 #define ABS(v)         (((v) > 0)?(v):-(v))
 
                 */
                return(rc);
 
+       KERNEL_DEBUG_CONSTANT(
+               TRACE_MP_CPU_FAST_START | DBG_FUNC_START,
+               slot_num, 0, 0, 0, 0);
+
        /*
         * Wait until the CPU is back online.
         */
        mp_wait_for_cpu_up(slot_num, 30000, 1);
        mp_enable_preemption();
 
+       KERNEL_DEBUG_CONSTANT(
+               TRACE_MP_CPU_FAST_START | DBG_FUNC_END,
+               slot_num, cpu_datap(slot_num)->cpu_running, 0, 0, 0);
+
        /*
         * Check to make sure that the CPU is really running.  If not,
         * go through the slow path.
        if (cpu_number() != psip->starter_cpu)
                return;
 
+       DBG("start_cpu(%p) about to start cpu %d, lapic %d\n",
+               arg, psip->target_cpu, psip->target_lapic);
+
+       KERNEL_DEBUG_CONSTANT(
+               TRACE_MP_CPU_START | DBG_FUNC_START,
+               psip->target_cpu,
+               psip->target_lapic, 0, 0, 0);
+
        i386_start_cpu(psip->target_lapic, psip->target_cpu);
 
 #ifdef POSTCODE_DELAY
        /* Wait much longer if postcodes are displayed for a delay period. */
        i *= 10000;
 #endif
+       DBG("start_cpu(%p) about to wait for cpu %d\n",
+               arg, psip->target_cpu);
+
        mp_wait_for_cpu_up(psip->target_cpu, i*100, 100);
+
+       KERNEL_DEBUG_CONSTANT(
+               TRACE_MP_CPU_START | DBG_FUNC_END,
+               psip->target_cpu,
+               cpu_datap(psip->target_cpu)->cpu_running, 0, 0, 0);
+
        if (TSC_sync_margin &&
            cpu_datap(psip->target_cpu)->cpu_running) {
                /*
        cpu_data_t      *cdp = current_cpu_datap();
 
        assert(!ml_get_interrupts_enabled());
+ 
+       KERNEL_DEBUG_CONSTANT(
+               TRACE_MP_CPU_DEACTIVATE | DBG_FUNC_START,
+               0, 0, 0, 0, 0);
 
        simple_lock(&x86_topo_lock);
        cdp->cpu_running = FALSE;
        simple_unlock(&x86_topo_lock);
 
+       /*
+        * Move all of this cpu's timers to the master/boot cpu,
+        * and poke it in case there's a sooner deadline for it to schedule.
+        */
        timer_queue_shutdown(&cdp->rtclock_timer.queue);
-       cdp->rtclock_timer.deadline = EndOfAllTime;
        mp_cpus_call(cpu_to_cpumask(master_cpu), ASYNC, etimer_timer_expire, NULL);
 
        /*
-        * In case a rendezvous/braodcast/call was initiated to this cpu
-        * before we cleared cpu_running, we must perform any actions due.
+        * Open an interrupt window
+        * and ensure any pending IPI or timer is serviced
         */
-       if (i_bit(MP_RENDEZVOUS, &cdp->cpu_signals))
-               mp_rendezvous_action();
-       if (i_bit(MP_BROADCAST, &cdp->cpu_signals))
-               mp_broadcast_action();
-       if (i_bit(MP_CALL, &cdp->cpu_signals))
-               mp_cpus_call_action();
-       cdp->cpu_signals = 0;                   /* all clear */
+       mp_disable_preemption();
+       ml_set_interrupts_enabled(TRUE);
+
+       while (cdp->cpu_signals && x86_lcpu()->rtcDeadline != EndOfAllTime)
+               cpu_pause();
+       /*
+        * Ensure there's no remaining timer deadline set
+        * - AICPM may have left one active.
+        */
+       setPop(0);
+
+       ml_set_interrupts_enabled(FALSE);
+       mp_enable_preemption();
+
+       KERNEL_DEBUG_CONSTANT(
+               TRACE_MP_CPU_DEACTIVATE | DBG_FUNC_END,
+               0, 0, 0, 0, 0);
 }
 
 int    pmsafe_debug    = 1;
                        cpu_NMI_interrupt(cpu);
                }
 
-       DBG("mp_kdp_enter() %u processors done %s\n",
+       DBG("mp_kdp_enter() %d processors done %s\n",
            (int)mp_kdp_ncpus, (mp_kdp_ncpus == ncpus) ? "OK" : "timed out");
        
        postcode(MP_KDP_ENTER);
        DBG("mp_kdp_wait()\n");
        /* If an I/O port has been specified as a debugging aid, issue a read */
        panic_io_port_read();
-
+       current_cpu_datap()->debugger_ipi_time = mach_absolute_time();
 #if CONFIG_MCA
        /* If we've trapped due to a machine-check, save MCA registers */
        mca_check_save();
                clock_init();
                cpu_machine_init();     /* Interrupts enabled hereafter */
                mp_cpus_call_cpu_init();
+       } else {
+               cpu_machine_init();     /* Interrupts enabled hereafter */
        }
 }
 
 
 void
 i386_start_cpu(int lapic_id, __unused int cpu_num )
 {
-       LAPIC_WRITE(ICRD, lapic_id << LAPIC_ICRD_DEST_SHIFT);
-       LAPIC_WRITE(ICR, LAPIC_ICR_DM_INIT);
+       LAPIC_WRITE_ICR(lapic_id, LAPIC_ICR_DM_INIT);   /* first command to the target LAPIC: INIT delivery mode */
        delay(100);
-
-       LAPIC_WRITE(ICRD, lapic_id << LAPIC_ICRD_DEST_SHIFT);
-       LAPIC_WRITE(ICR, LAPIC_ICR_DM_STARTUP|(REAL_MODE_BOOTSTRAP_OFFSET>>12));
+       LAPIC_WRITE_ICR(lapic_id,
+                       LAPIC_ICR_DM_STARTUP|(REAL_MODE_BOOTSTRAP_OFFSET>>12)); /* second command: STARTUP delivery mode OR'ed with the bootstrap offset shifted right by 12 */
 }
 
 void
 {
        boolean_t state = ml_set_interrupts_enabled(FALSE);
        /* Program the interrupt command register */
-       LAPIC_WRITE(ICRD, cpu_to_lapic[cpu] << LAPIC_ICRD_DEST_SHIFT);
        /* The vector is ignored in this case--the target CPU will enter on the
         * NMI vector.
         */
-       LAPIC_WRITE(ICR, LAPIC_VECTOR(INTERPROCESSOR)|LAPIC_ICR_DM_NMI);
+       LAPIC_WRITE_ICR(cpu_to_lapic[cpu],
+                       LAPIC_VECTOR(INTERPROCESSOR)|LAPIC_ICR_DM_NMI);
        (void) ml_set_interrupts_enabled(state);
 }
 
 
        volatile uint64_t       tsc_base;       /* timestamp */
        volatile uint64_t       ns_base;        /* nanoseconds */
        uint32_t                scale;          /* tsc -> nanosec multiplier */
-       uint32_t                shift;          /* tsc -> nanosec shift/div */
-                                               /* shift is overloaded with
-                                                * lower 32bits of tsc_freq
-                                                * on slower machines (SLOW_TSC_THRESHOLD) */
+       uint32_t                shift;          /* shift is nonzero only on "slow" machines, */
+                                               /* ie where tscFreq <= SLOW_TSC_THRESHOLD */
        volatile uint32_t       generation;     /* 0 == being updated */
        uint32_t                spare1;
 };
 
-
 /*
  * Copyright (c) 2009 Apple Inc. All rights reserved.
  *
 /* Include a PAL-specific header, too, for xnu-internal overrides */
 #include <i386/pal_native.h>
 
+
 extern boolean_t virtualized;
 #define PAL_VIRTUALIZED_PROPERTY_VALUE 4
 
 
                return fpu_set_fxstate(thr_act, tstate, flavor);
        }
 
+       case x86_AVX_STATE:
+       {
+               /*
+                * Self-describing AVX container: ash names the embedded
+                * flavor and count, ufs holds the 32- or 64-bit payload.
+                * The header must carry the AVX flavor/count pair that
+                * matches the thread's bitness -- the same pair the
+                * x86_AVX_STATE get path writes -- and the payload is
+                * handed to fpu_set_fxstate() under that AVX flavor.
+                * Any other header is rejected with KERN_INVALID_ARGUMENT.
+                */
+               x86_avx_state_t       *state;
+
+               if (count != x86_AVX_STATE_COUNT)
+                       return(KERN_INVALID_ARGUMENT);
+
+               state = (x86_avx_state_t *)tstate;
+               if (state->ash.flavor == x86_AVX_STATE64 &&
+                   state->ash.count  == x86_AVX_STATE64_COUNT &&
+                   thread_is_64bit(thr_act)) {
+                       return fpu_set_fxstate(thr_act,
+                                              (thread_state_t)&state->ufs.as64,
+                                              x86_AVX_STATE64);
+               }
+               if (state->ash.flavor == x86_AVX_STATE32 &&
+                   state->ash.count  == x86_AVX_STATE32_COUNT &&
+                   !thread_is_64bit(thr_act)) {
+                       return fpu_set_fxstate(thr_act,
+                                              (thread_state_t)&state->ufs.as32,
+                                              x86_AVX_STATE32);
+               }
+               return(KERN_INVALID_ARGUMENT);
+       }
+
        case x86_THREAD_STATE32: 
        {
                if (count != x86_THREAD_STATE32_COUNT)
                break;
            }
 
+           case THREAD_STATE_FLAVOR_LIST_10_9:
+           {
+               if (*count < 5)
+                       return (KERN_INVALID_ARGUMENT);
+
+               tstate[0] = x86_THREAD_STATE;
+               tstate[1] = x86_FLOAT_STATE;
+               tstate[2] = x86_EXCEPTION_STATE;
+               tstate[3] = x86_DEBUG_STATE;
+               tstate[4] = x86_AVX_STATE;
+
+               *count = 5;
+               break;
+           }
+
            case x86_SAVED_STATE32:
            {
                x86_saved_state32_t     *state;
                return(kret);
            }
 
-       case x86_AVX_STATE32:
-       {
+           case x86_AVX_STATE32:
+           {
                if (*count != x86_AVX_STATE32_COUNT)
                        return(KERN_INVALID_ARGUMENT);
 
                *count = x86_AVX_STATE32_COUNT;
 
                return fpu_get_fxstate(thr_act, tstate, flavor);
-       }
+           }
 
-       case x86_AVX_STATE64:
-       {
+           case x86_AVX_STATE64:
+           {
                if (*count != x86_AVX_STATE64_COUNT)
                        return(KERN_INVALID_ARGUMENT);
 
                *count = x86_AVX_STATE64_COUNT;
 
                return fpu_get_fxstate(thr_act, tstate, flavor);
-       }
+           }
+
+           case x86_AVX_STATE:
+           {
+               x86_avx_state_t         *state;
+               kern_return_t           kret;
+
+               if (*count < x86_AVX_STATE_COUNT)
+                       return(KERN_INVALID_ARGUMENT);
+
+               state = (x86_avx_state_t *)tstate;
+
+               bzero((char *)state, sizeof(x86_avx_state_t));
+               if (thread_is_64bit(thr_act)) {
+                       state->ash.flavor = x86_AVX_STATE64;
+                       state->ash.count  = x86_AVX_STATE64_COUNT;
+                       kret = fpu_get_fxstate(thr_act,
+                                              (thread_state_t)&state->ufs.as64,
+                                              x86_AVX_STATE64);
+               } else {
+                       state->ash.flavor = x86_AVX_STATE32;
+                       state->ash.count  = x86_AVX_STATE32_COUNT;
+                       kret = fpu_get_fxstate(thr_act,
+                                              (thread_state_t)&state->ufs.as32,
+                                              x86_AVX_STATE32);
+               }
+               *count = x86_AVX_STATE_COUNT;
+
+               return(kret);
+           }
 
            case x86_THREAD_STATE32: 
            {
 
 #include <kern/sched_prim.h>
 #include <i386/lapic.h>
 #include <i386/pal_routines.h>
-
 #include <sys/kdebug.h>
 
 extern int disableConsoleOutput;
 
 #define DELAY_UNSET            0xFFFFFFFFFFFFFFFFULL
 
+/*
+ * Exclusive upper bounds of the idle-interval histogram bins, written as
+ * multiples of NSEC_PER_USEC: twelve power-of-two steps from 16 to 32768.
+ */
+uint64_t cpu_itime_bins[CPU_ITIME_BINS] = {
+       16 * NSEC_PER_USEC,
+       32 * NSEC_PER_USEC,
+       64 * NSEC_PER_USEC,
+       128 * NSEC_PER_USEC,
+       256 * NSEC_PER_USEC,
+       512 * NSEC_PER_USEC,
+       1024 * NSEC_PER_USEC,
+       2048 * NSEC_PER_USEC,
+       4096 * NSEC_PER_USEC,
+       8192 * NSEC_PER_USEC,
+       16384 * NSEC_PER_USEC,
+       32768 * NSEC_PER_USEC
+};
+
+/* The run-interval histogram uses the same table of bounds. */
+uint64_t *cpu_rtime_bins = cpu_itime_bins;
+
 /*
  * The following is set when the KEXT loads and initializes.
  */
 pmDispatch_t   *pmDispatch     = NULL;
 
-static uint32_t                pmInitDone              = 0;
+uint32_t               pmInitDone              = 0;
 static boolean_t       earlyTopology           = FALSE;
 static uint64_t                earlyMaxBusDelay        = DELAY_UNSET;
 static uint64_t                earlyMaxIntDelay        = DELAY_UNSET;
        (*pmDispatch->cstateInit)();
 }
 
-#define CPU_ACTIVE_STAT_BIN_1 (500000)
-#define CPU_ACTIVE_STAT_BIN_2 (2000000)
-#define CPU_ACTIVE_STAT_BIN_3 (5000000)
-
-#define CPU_IDLE_STAT_BIN_1 (500000)
-#define CPU_IDLE_STAT_BIN_2 (2000000)
-#define CPU_IDLE_STAT_BIN_3 (5000000)
+/*
+ * Histogram helper: increment the counter of the first bin whose bound in
+ * binvals[] is strictly greater than interval.  bins[] and binvals[] are
+ * indexed in parallel and nbins gives the number of entries examined.  An
+ * interval that is not below any bound leaves every counter untouched.
+ */
+static inline void
+machine_classify_interval(uint64_t interval, uint64_t *bins, uint64_t *binvals, uint32_t nbins)
+{
+       uint32_t slot = 0;
+
+       /* Walk past every bin whose bound the interval has reached. */
+       while (slot < nbins && interval >= binvals[slot])
+               slot++;
+
+       if (slot < nbins)
+               bins[slot]++;
+}
 
 /*
  * Called when the CPU is idle.  It calls into the power management kext
 void
 machine_idle(void)
 {
-    cpu_data_t         *my_cpu         = current_cpu_datap();
-    uint64_t           ctime, rtime, itime;
+       cpu_data_t              *my_cpu         = current_cpu_datap();
+       uint64_t                ctime, rtime, itime;
 
-    if (my_cpu == NULL)
-       goto out;
+       if (my_cpu == NULL)
+               goto out;
 
        ctime = mach_absolute_time();
 
-    my_cpu->lcpu.state = LCPU_IDLE;
-    DBGLOG(cpu_handle, cpu_number(), MP_IDLE);
-    MARK_CPU_IDLE(cpu_number());
+       my_cpu->lcpu.state = LCPU_IDLE;
+       DBGLOG(cpu_handle, cpu_number(), MP_IDLE);
+       MARK_CPU_IDLE(cpu_number());
 
        rtime = ctime - my_cpu->cpu_ixtime;
 
        my_cpu->cpu_rtime_total += rtime;
+       machine_classify_interval(rtime, &my_cpu->cpu_rtimes[0], &cpu_rtime_bins[0], CPU_RTIME_BINS);
+
+       if (pmInitDone) {
+               /*
+                * Handle case where ml_set_maxbusdelay() or ml_set_maxintdelay()
+                * were called prior to the CPU PM kext being registered.  We do
+                * this here since we know at this point the values will be first
+                * used since idle is where the decisions using these values is made.
+                */
+               if (earlyMaxBusDelay != DELAY_UNSET)
+                       ml_set_maxbusdelay((uint32_t)(earlyMaxBusDelay & 0xFFFFFFFF));
+
+               if (earlyMaxIntDelay != DELAY_UNSET)
+                       ml_set_maxintdelay(earlyMaxIntDelay);
+       }
 
-       if (rtime < CPU_ACTIVE_STAT_BIN_1)
-               my_cpu->cpu_rtimes[0]++;
-       else if (rtime < CPU_ACTIVE_STAT_BIN_2)
-               my_cpu->cpu_rtimes[1]++;
-       else if (rtime < CPU_ACTIVE_STAT_BIN_3)
-               my_cpu->cpu_rtimes[2]++;
-       else
-               my_cpu->cpu_rtimes[3]++;
-
+       if (pmInitDone
+           && pmDispatch != NULL
+           && pmDispatch->MachineIdle != NULL)
+               (*pmDispatch->MachineIdle)(0x7FFFFFFFFFFFFFFFULL);
+       else {
+               /*
+                * If no power management, re-enable interrupts and halt.
+                * This will keep the CPU from spinning through the scheduler
+                * and will allow at least some minimal power savings (but it
+                * can cause problems in some MP configurations w.r.t. the APIC
+                * stopping during a GV3 transition).
+                */
+               pal_hlt();
+
+               /* Once woken, re-disable interrupts. */
+               pal_cli();
+       }
 
-    if (pmInitDone) {
        /*
-        * Handle case where ml_set_maxbusdelay() or ml_set_maxintdelay()
-        * were called prior to the CPU PM kext being registered.  We do
-        * this here since we know at this point the values will be first
-        * used since idle is where the decisions using these values is made.
+        * Mark the CPU as running again.
         */
-       if (earlyMaxBusDelay != DELAY_UNSET)
-           ml_set_maxbusdelay((uint32_t)(earlyMaxBusDelay & 0xFFFFFFFF));
-
-       if (earlyMaxIntDelay != DELAY_UNSET)
-           ml_set_maxintdelay(earlyMaxIntDelay);
-    }
-
-    if (pmInitDone
-       && pmDispatch != NULL
-       && pmDispatch->MachineIdle != NULL)
-       (*pmDispatch->MachineIdle)(0x7FFFFFFFFFFFFFFFULL);
-    else {
-       /*
-        * If no power management, re-enable interrupts and halt.
-        * This will keep the CPU from spinning through the scheduler
-        * and will allow at least some minimal power savings (but it
-        * cause problems in some MP configurations w.r.t. the APIC
-        * stopping during a GV3 transition).
-        */
-       pal_hlt();
-
-       /* Once woken, re-disable interrupts. */
-       pal_cli();
-    }
-
-    /*
-     * Mark the CPU as running again.
-     */
-    MARK_CPU_ACTIVE(cpu_number());
-    DBGLOG(cpu_handle, cpu_number(), MP_UNIDLE);
+       MARK_CPU_ACTIVE(cpu_number());
+       DBGLOG(cpu_handle, cpu_number(), MP_UNIDLE);
 
        uint64_t ixtime = my_cpu->cpu_ixtime = mach_absolute_time();
-       itime = ixtime - ctime;
+       my_cpu->cpu_idle_exits++;
 
-    my_cpu->lcpu.state = LCPU_RUN;
+       itime = ixtime - ctime;
 
-       if (itime < CPU_IDLE_STAT_BIN_1)
-               my_cpu->cpu_itimes[0]++;
-       else if (itime < CPU_IDLE_STAT_BIN_2)
-               my_cpu->cpu_itimes[1]++;
-       else if (itime < CPU_IDLE_STAT_BIN_3)
-               my_cpu->cpu_itimes[2]++;
-       else
-               my_cpu->cpu_itimes[3]++;
+       my_cpu->lcpu.state = LCPU_RUN;
 
+       machine_classify_interval(itime, &my_cpu->cpu_itimes[0], &cpu_itime_bins[0], CPU_ITIME_BINS);
        my_cpu->cpu_itime_total += itime;
 
 
-    /*
-     * Re-enable interrupts.
-     */
-  out:
-    pal_sti();
+       /*
+        * Re-enable interrupts.
+        */
+out:
+       pal_sti();
 }
 
 /*
        break;
 
     case PM_HALT_NORMAL:
+    case PM_HALT_SLEEP:
     default:
         pal_cli();
 
            (*pmDispatch->pmCPUHalt)();
 
            /*
-            * We've exited halt, so get the the CPU schedulable again.
+            * We've exited halt, so get the CPU schedulable again.
+            * - by calling the fast init routine for a slave, or
+            * - by returning if we're the master processor.
             */
-           i386_init_slave_fast();
-
-           panic("init_slave_fast returned");
+           if (cpup->cpu_number != master_cpu) {
+               i386_init_slave_fast();
+               panic("init_slave_fast returned");
+           }
        } else
        {
            /*
     pmInitDone = 1;
 }
 
-static x86_lcpu_t *
+x86_lcpu_t *
 pmGetLogicalCPU(int cpu)
 {
     return(cpu_to_lcpu(cpu));
 }
 
-static x86_lcpu_t *
+x86_lcpu_t *
 pmGetMyLogicalCPU(void)
 {
     cpu_data_t *cpup   = current_cpu_datap();
 /*
  * Returns the root of the package tree.
  */
-static x86_pkg_t *
+x86_pkg_t *
 pmGetPkgRoot(void)
 {
     return(x86_pkgs);
     return(cpu_datap(cpu)->cpu_hibernate);
 }
 
-static processor_t
+processor_t
 pmLCPUtoProcessor(int lcpu)
 {
     return(cpu_datap(lcpu)->cpu_processor);
                && rtc_nanotime->generation != pal_rtc_nanotime_info.generation);
 }
 
-static uint32_t
+uint32_t
 pmTimerQueueMigrate(int target_cpu)
 {
     /* Call the etimer code to do this. */
     }
 
     if (cpuFuncs != NULL) {
+        if (pmDispatch) {
+            panic("Attempt to re-register power management interface--AICPM present in xcpm mode? %p->%p", pmDispatch, cpuFuncs);
+        }
+
        pmDispatch = cpuFuncs;
 
        if (earlyTopology
        if (entry) {
                (void)__sync_fetch_and_add(&my_cpu->lcpu.package->num_idle, 1);
        }
-       else {
-               (void)__sync_fetch_and_sub(&my_cpu->lcpu.package->num_idle, 1);
-       }
+       else {
+               uint32_t nidle = __sync_fetch_and_sub(&my_cpu->lcpu.package->num_idle, 1);
+               if (nidle == topoParms.nLThreadsPerPackage) {
+                       my_cpu->lcpu.package->package_idle_exits++;
+               }
+       }
 }
 
 void pmTimerRestore(void);
 kern_return_t pmCPUExitHalt(int cpu);
 kern_return_t pmCPUExitHaltToOff(int cpu);
+uint32_t pmTimerQueueMigrate(int);
 
 #define PM_HALT_NORMAL         0               /* normal halt path */
 #define PM_HALT_DEBUG          1               /* debug code wants to halt */
 #define                URGENCY_NOTIFICATION_ASSERT_NS (5 * 1000 * 1000)
 extern uint64_t        urgency_notification_assert_abstime_threshold;
 
+x86_lcpu_t *
+pmGetLogicalCPU(int cpu);              /* returns cpu_to_lcpu(cpu) */
+x86_lcpu_t *
+pmGetMyLogicalCPU(void);               /* NOTE(review): presumably the calling CPU's logical CPU -- confirm */
+processor_t
+pmLCPUtoProcessor(int lcpu);           /* returns cpu_datap(lcpu)->cpu_processor */
+x86_pkg_t *
+pmGetPkgRoot(void);                    /* returns x86_pkgs, the root of the package tree */
+
+
 /******************************************************************************
  *
  * All of the following are deprecated interfaces and no longer used.
 
        set_cr3_raw(get_cr3_raw());
 }
 #endif
+extern int rdmsr64_carefully(uint32_t msr, uint64_t *val);
+extern int wrmsr64_carefully(uint32_t msr, uint64_t val);
 #endif /* MACH_KERNEL_PRIVATE */
 
 static inline void wbinvd(void)
  * The implementation is in locore.s.
  */
 extern int rdmsr_carefully(uint32_t msr, uint32_t *lo, uint32_t *hi);
-
 __END_DECLS
 
 #endif /* ASSEMBLER */
 #define MSR_IA32_MPERF                         0xE7
 #define MSR_IA32_APERF                         0xE8
 
-#define MSR_PMG_CST_CONFIG_CONTROL             0xe2
-
 #define MSR_IA32_BBL_CR_CTL                    0x119
 
 #define MSR_IA32_SYSENTER_CS                   0x174
 
 #define MSR_IA32_MISC_ENABLE                   0x1a0
 
-#define MSR_IA32_ENERGY_PERFORMANCE_BIAS       0x1b0
 #define MSR_IA32_PACKAGE_THERM_STATUS          0x1b1
 #define MSR_IA32_PACKAGE_THERM_INTERRUPT       0x1b2
 
 #define MSR_IA32_PKG_POWER_SKU_UNIT            0x606
 #define MSR_IA32_PKG_C2_RESIDENCY              0x60D
 #define MSR_IA32_PKG_ENERGY_STATUS             0x611
-#define MSR_IA32_PRIMARY_PLANE_ENERY_STATUS    0x639
-#define MSR_IA32_SECONDARY_PLANE_ENERY_STATUS  0x641
+
+#define MSR_IA32_DDR_ENERGY_STATUS             0x619
+#define MSR_IA32_LLC_FLUSHED_RESIDENCY_TIMER   0x61D
+#define MSR_IA32_RING_PERF_STATUS              0x621
+
+#define MSR_IA32_PKG_C8_RESIDENCY              0x630
+#define MSR_IA32_PKG_C9_RESIDENCY              0x631
+#define MSR_IA32_PKG_C10_RESIDENCY             0x632
+
+#define MSR_IA32_PP0_ENERGY_STATUS             0x639
+#define MSR_IA32_PP1_ENERGY_STATUS             0x641
+#define MSR_IA32_IA_PERF_LIMIT_REASONS         0x690
+#define MSR_IA32_GT_PERF_LIMIT_REASONS         0x6B0
+
 #define MSR_IA32_TSC_DEADLINE                  0x6e0
 
 #define        MSR_IA32_EFER                           0xC0000080
 
        etimer_resync_deadlines();
 }
 
-/*
- * tsc_to_nanoseconds:
- *
- * Basic routine to convert a raw 64 bit TSC value to a
- * 64 bit nanosecond value.  The conversion is implemented
- * based on the scale factor and an implicit 32 bit shift.
- */
-static inline uint64_t
-_tsc_to_nanoseconds(uint64_t value)
-{
-#if defined(__i386__)
-    asm volatile("movl %%edx,%%esi     ;"
-                "mull  %%ecx           ;"
-                "movl  %%edx,%%edi     ;"
-                "movl  %%esi,%%eax     ;"
-                "mull  %%ecx           ;"
-                "addl  %%edi,%%eax     ;"      
-                "adcl  $0,%%edx         "
-                : "+A" (value)
-                : "c" (pal_rtc_nanotime_info.scale)
-                : "esi", "edi");
-#elif defined(__x86_64__)
-    asm volatile("mul %%rcx;"
-                "shrq $32, %%rax;"
-                "shlq $32, %%rdx;"
-                "orq %%rdx, %%rax;"
-                : "=a"(value)
-                : "a"(value), "c"(pal_rtc_nanotime_info.scale)
-                : "rdx", "cc" );
-#else
-#error Unsupported architecture
-#endif
-
-    return (value);
-}
-
 static inline uint32_t
 _absolutetime_to_microtime(uint64_t abstime, clock_sec_t *secs, clock_usec_t *microsecs)
 {
 static inline uint64_t
 rtc_nanotime_read(void)
 {
-       
-#if CONFIG_EMBEDDED
-       if (gPEClockFrequencyInfo.timebase_frequency_hz > SLOW_TSC_THRESHOLD)
-               return  _rtc_nanotime_read(&rtc_nanotime_info, 1);      /* slow processor */
-       else
-#endif
-       return  _rtc_nanotime_read(&pal_rtc_nanotime_info, 0);  /* assume fast processor */
+       return  _rtc_nanotime_read(&pal_rtc_nanotime_info);
 }
 
 /*
 
        assert(!ml_get_interrupts_enabled());
        tsc = rdtsc64();
-       oldnsecs = rntp->ns_base + _tsc_to_nanoseconds(tsc - rntp->tsc_base);
-       newnsecs = base + _tsc_to_nanoseconds(tsc - tsc_base);
+       oldnsecs = rntp->ns_base + _rtc_tsc_to_nanoseconds(tsc - rntp->tsc_base, rntp);
+       newnsecs = base + _rtc_tsc_to_nanoseconds(tsc - tsc_base, rntp);
        
        /*
         * Only update the base values if time using the new base values
  * rtc_sleep_wakeup:
  *
  * Invoked from power management when we have awoken from a sleep (S3)
- * and the TSC has been reset.  The nanotime data is updated based on
- * the passed in value.
+ * and the TSC has been reset, or from Deep Idle (S0) sleep when the TSC
+ * has progressed.  The nanotime data is updated based on the passed-in value.
  *
  * The caller must guarantee non-reentrancy.
  */
                rtc_timer_init();
                clock_timebase_init();
                ml_init_lock_timeout();
-               ml_init_delay_spin_threshold();
+               ml_init_delay_spin_threshold(10);
        }
 
        /* Set fixed configuration for lapic timers */
 rtc_set_timescale(uint64_t cycles)
 {
        pal_rtc_nanotime_t      *rntp = &pal_rtc_nanotime_info;
+       uint32_t    shift = 0;
+    
+       /* the "scale" factor will overflow unless cycles>SLOW_TSC_THRESHOLD */
+    
+       while ( cycles <= SLOW_TSC_THRESHOLD) {
+               shift++;
+               cycles <<= 1;
+       }
+       
+       if ( shift != 0 )
+               printf("Slow TSC, rtc_nanotime.shift == %d\n", shift);
+    
        rntp->scale = (uint32_t)(((uint64_t)NSEC_PER_SEC << 32) / cycles);
 
-#if CONFIG_EMBEDDED
-       if (cycles <= SLOW_TSC_THRESHOLD)
-               rntp->shift = (uint32_t)cycles;
-       else
-#endif
-               rntp->shift = 32;
+       rntp->shift = shift;
 
        if (tsc_rebase_abs_time == 0)
                tsc_rebase_abs_time = mach_absolute_time();
 
 void
 machine_delay_until(
-       uint64_t                deadline)
+        uint64_t interval,
+        uint64_t                deadline)
 {
-       uint64_t                now;
-
-       do {
-               cpu_pause();
-               now = mach_absolute_time();
-       } while (now < deadline);
+        (void)interval;
+        while (mach_absolute_time() < deadline) {
+                cpu_pause();
+        }
 }
 
 
 /*
  * Assembly snippet included in exception handlers and rtc_nanotime_read()
+ *
+ *
+ * Warning!  There are several copies of this code in the trampolines found in
+ * osfmk/x86_64/idt64.s, coming from the various TIMER macros in rtclock_asm.h.
+ * They're all kept in sync by using the RTC_NANOTIME_READ() macro.
+ *
+ * The algorithm we use is:
+ *
+ *     ns = ((((rdtsc - rnt_tsc_base)<<rnt_shift)*rnt_tsc_scale) / 2**32) + rnt_ns_base;
+ *
+ * rnt_shift, a constant computed during initialization, is the smallest value for which:
+ *
+ *     (tscFreq << rnt_shift) > SLOW_TSC_THRESHOLD
+ *
+ * Where SLOW_TSC_THRESHOLD is about 10**9.  Since most processors' tscFreqs are greater
+ * than 1GHz, rnt_shift is usually 0.  rnt_tsc_scale is also a 32-bit constant:
+ *
+ *     rnt_tsc_scale = (10**9 * 2**32) / (tscFreq << rnt_shift);
+ *
  * %rdi points to nanotime info struct.
  * %rax returns nanotime
  */
        rdtsc                                                           ; \
        lfence                                                          ; \
        shlq    $32,%rdx                                                ; \
+       movl    RNT_SHIFT(%rdi),%ecx                                    ; \
        orq     %rdx,%rax                       /* %rax := tsc */       ; \
        subq    RNT_TSC_BASE(%rdi),%rax         /* tsc - tsc_base */    ; \
-       xorq    %rcx,%rcx                                               ; \
+       shlq    %cl,%rax                                                ; \
        movl    RNT_SCALE(%rdi),%ecx                                    ; \
        mulq    %rcx                            /* delta * scale */     ; \
        shrdq   $32,%rdx,%rax                   /* %rdx:%rax >>= 32 */  ; \
 
                        pal_rtc_nanotime_t      *dst);
 
 extern uint64_t        _rtc_nanotime_read(
-                       pal_rtc_nanotime_t      *rntp,
-                       int                     slow);
+                       pal_rtc_nanotime_t      *rntp);
+
+extern uint64_t _rtc_tsc_to_nanoseconds(
+                       uint64_t    value,
+                       pal_rtc_nanotime_t      *rntp);
 
 extern void    rtclock_intr(x86_saved_state_t *regs);
 
 
 #include <mach/i386/syscall_sw.h>
 
 #include <libkern/OSDebug.h>
-
+#include <i386/cpu_threads.h>
 #include <machine/pal_routines.h>
 
 extern void throttle_lowpri_io(int);
        int             ipl;
        int             cnum = cpu_number();
        int             itype = 0;
-       
+
        if (is_saved_state64(state) == TRUE) {
                x86_saved_state64_t     *state64;
 
                interrupt_num = state32->trapno;
        }
 
+       if (cpu_data_ptr[cnum]->lcpu.package->num_idle == topoParms.nLThreadsPerPackage)
+               cpu_data_ptr[cnum]->cpu_hwIntpexits[interrupt_num]++;
+
        if (interrupt_num == (LAPIC_DEFAULT_INTERRUPT_BASE + LAPIC_INTERPROCESSOR_INTERRUPT))
                itype = 1;
        else if (interrupt_num == (LAPIC_DEFAULT_INTERRUPT_BASE + LAPIC_TIMER_INTERRUPT))
 
        busFreq = EFI_FSB_frequency();
 
        switch (cpuid_cpufamily()) {
+       case CPUFAMILY_INTEL_HASWELL:
        case CPUFAMILY_INTEL_IVYBRIDGE:
        case CPUFAMILY_INTEL_SANDYBRIDGE:
        case CPUFAMILY_INTEL_WESTMERE:
        }
 
        kprintf(" BUS: Frequency = %6d.%06dMHz, "
-               "cvtt2n = %08Xx.%08Xx, cvtn2t = %08Xx.%08Xx\n",
+               "cvtt2n = %08X.%08X, cvtn2t = %08X.%08X\n",
                (uint32_t)(busFreq / Mega),
                (uint32_t)(busFreq % Mega), 
                (uint32_t)(busFCvtt2n >> 32), (uint32_t)busFCvtt2n,
        tscFCvtn2t = 0xFFFFFFFFFFFFFFFFULL / tscFCvtt2n;
 
        kprintf(" TSC: Frequency = %6d.%06dMHz, "
-               "cvtt2n = %08Xx.%08Xx, cvtn2t = %08Xx.%08Xx, gran = %lld%s\n",
+               "cvtt2n = %08X.%08X, cvtn2t = %08X.%08X, gran = %lld%s\n",
                (uint32_t)(tscFreq / Mega),
                (uint32_t)(tscFreq % Mega), 
                (uint32_t)(tscFCvtt2n >> 32), (uint32_t)tscFCvtt2n,
 
 
 #define BASE_NHM_CLOCK_SOURCE  133333333ULL
 #define IA32_PERF_STS          0x198
-#define        SLOW_TSC_THRESHOLD      1000067800      /* TSC is too slow for regular nanotime() algorithm */
+#define        SLOW_TSC_THRESHOLD      1000067800      /* if slower, nonzero shift required in nanotime() algorithm */
 
 #ifndef ASSEMBLER
 extern uint64_t        busFCvtt2n;
 
        if (    ml_delay_should_spin(interval)  ||
                        get_preemption_level() != 0                             ||
                        ml_get_interrupts_enabled() == FALSE    ) {
-               machine_delay_until(deadline);
+               machine_delay_until(interval, deadline);
        } else {
                assert_wait_deadline((event_t)clock_delay_until, THREAD_UNINT, deadline);
 
 
                                                clock_sec_t                             *secs,
                                                clock_usec_t                    *microsecs);
 
-extern void                    machine_delay_until(
+extern void                    machine_delay_until(uint64_t interval,
                                                uint64_t                deadline);
 
 extern uint32_t                hz_tick_interval;
 
 }
 
 /*
- * Called at splsched.
+ * Called with interrupts disabled.
  */
 void
 processor_doshutdown(
 {
        thread_t                        old_thread, self = current_thread();
        processor_t                     prev;
+       processor_set_t                 pset;
 
        /*
         *      Get onto the processor to shutdown
        prev = thread_bind(processor);
        thread_block(THREAD_CONTINUE_NULL);
 
-#if HIBERNATION
-       if (processor_avail_count < 2)
-               hibernate_vm_lock();
-#endif
-
        assert(processor->state == PROCESSOR_SHUTDOWN);
 
+       ml_cpu_down();
+
 #if HIBERNATION
-       if (processor_avail_count < 2)
+       if (processor_avail_count < 2) {
+               hibernate_vm_lock();
                hibernate_vm_unlock();
+       }
 #endif
 
+       pset = processor->processor_set;
+       pset_lock(pset);
+       processor->state = PROCESSOR_OFF_LINE;
+       if (--pset->online_processor_count == 0) {
+               pset_pri_init_hint(pset, PROCESSOR_NULL);
+               pset_count_init_hint(pset, PROCESSOR_NULL);
+       }
+       (void)hw_atomic_sub(&processor_avail_count, 1);
+       commpage_update_active_cpus();
+       SCHED(processor_queue_shutdown)(processor);
+       /* pset lock dropped */
+
        /*
         *      Continue processor shutdown in shutdown context.
         */
 }
 
 /*
  *     Complete the shutdown and place the processor offline.
  *
  *     Called at splsched in the shutdown context.
  */
        processor_t                     processor)
 {
        thread_t                        new_thread, old_thread = processor->active_thread;
-       processor_set_t         pset;
 
        new_thread = processor->idle_thread;
        processor->active_thread = new_thread;
 
        PMAP_DEACTIVATE_KERNEL(processor->cpu_id);
 
-       pset = processor->processor_set;
-       pset_lock(pset);
-       processor->state = PROCESSOR_OFF_LINE;
-       if (--pset->online_processor_count == 0) {
-               pset_pri_init_hint(pset, PROCESSOR_NULL);
-               pset_count_init_hint(pset, PROCESSOR_NULL);
-       }
-       (void)hw_atomic_sub(&processor_avail_count, 1);
-       commpage_update_active_cpus();
-       SCHED(processor_queue_shutdown)(processor);
-       /* pset lock dropped */
-
-       ml_cpu_down();
-
        cpu_sleep();
        panic("zombie processor");
        /*NOTREACHED*/
 
        int                                     cpu_id,
        processor_set_t         pset)
 {
+       spl_t           s;
+
        if (processor != master_processor) {
                /* Scheduler state deferred until sched_init() */
                SCHED(processor_init)(processor);
        processor_data_init(processor);
        processor->processor_list = NULL;
 
+       s = splsched();
        pset_lock(pset);
        if (pset->cpu_set_count++ == 0)
                pset->cpu_set_low = pset->cpu_set_hi = cpu_id;
                pset->cpu_set_hi = (cpu_id > pset->cpu_set_hi)? cpu_id: pset->cpu_set_hi;
        }
        pset_unlock(pset);
+       splx(s);
 
        simple_lock(&processor_list_lock);
        if (processor_list == NULL)
 
 #include <pmc/pmc.h>
 #endif
 
+#include <i386/pmCPU.h>
 static void            kernel_bootstrap_thread(void);
 
 static void            load_context(
 
 /* size of kernel trace buffer, disabled by default */
 unsigned int new_nkdbufs = 0;
+unsigned int wake_nkdbufs = 0;
 
 /* mach leak logging */
 int log_leaks = 0;
 
        PE_parse_boot_argn("trace", &new_nkdbufs, sizeof (new_nkdbufs));
 
+       PE_parse_boot_argn("trace_wake", &wake_nkdbufs, sizeof (wake_nkdbufs));
+
        /* i386_vm_init already checks for this ; do it aagin anyway */
         if (PE_parse_boot_argn("serverperfmode", &serverperfmode, sizeof (serverperfmode))) {
                 serverperfmode = 1;
 #if (defined(__i386__) || defined(__x86_64__))
        if (turn_on_log_leaks && !new_nkdbufs)
                new_nkdbufs = 200000;
-       start_kern_tracing(new_nkdbufs);
+       start_kern_tracing(new_nkdbufs, FALSE);
        if (turn_on_log_leaks)
                log_leaks = 1;
 #endif
 #if (!defined(__i386__) && !defined(__x86_64__))
        if (turn_on_log_leaks && !new_nkdbufs)
                new_nkdbufs = 200000;
-       start_kern_tracing(new_nkdbufs);
+       start_kern_tracing(new_nkdbufs, FALSE);
        if (turn_on_log_leaks)
                log_leaks = 1;
 #endif
 
 #ifndef        _MACH_BRANCH_PREDICATES_H
 #define        _MACH_BRANCH_PREDICATES_H
 
-#define        __probable(x)   __builtin_expect((x), 1)
-#define        __improbable(x) __builtin_expect((x), 0)
+#define        __probable(x)   __builtin_expect((long)(x), 1L)
+#define        __improbable(x) __builtin_expect((long)(x), 0L)
 #endif /* _MACH_BRANCH_PREDICATES_H */
 
 #define x86_DEBUG_STATE64              11
 #define x86_DEBUG_STATE                        12
 #define THREAD_STATE_NONE              13
-/* 15 and 16 are used for the internal x86_SAVED_STATE flavours */
+/* 14 and 15 are used for the internal x86_SAVED_STATE flavours */
 #define x86_AVX_STATE32                        16
 #define x86_AVX_STATE64                        17
+#define x86_AVX_STATE                  18
 
 
 /*
          (x == x86_DEBUG_STATE)        || \
          (x == x86_AVX_STATE32)        || \
          (x == x86_AVX_STATE64)        || \
+         (x == x86_AVX_STATE)          || \
          (x == THREAD_STATE_NONE))
 
 struct x86_state_hdr {
        } uds;
 };
 
+struct x86_avx_state {                         /* AVX state tagged with the flavor it carries */
+       x86_state_hdr_t                 ash;    /* flavor and count describing the member of ufs in use */
+       union {
+               x86_avx_state32_t       as32;   /* used when ash.flavor is x86_AVX_STATE32 */
+               x86_avx_state64_t       as64;   /* used when ash.flavor is x86_AVX_STATE64 */
+       } ufs;
+};
+
 typedef struct x86_thread_state x86_thread_state_t;
 #define x86_THREAD_STATE_COUNT ((mach_msg_type_number_t) \
                ( sizeof (x86_thread_state_t) / sizeof (int) ))
 #define x86_DEBUG_STATE_COUNT ((mach_msg_type_number_t) \
                (sizeof(x86_debug_state_t)/sizeof(unsigned int)))
 
+typedef struct x86_avx_state x86_avx_state_t;  /* state type for the x86_AVX_STATE flavor */
+#define x86_AVX_STATE_COUNT ((mach_msg_type_number_t) \
+               (sizeof(x86_avx_state_t)/sizeof(unsigned int))) /* size of x86_avx_state_t in unsigned-int units */
+
 /*
  * Machine-independent way for servers and Mach's exception mechanism to
  * choose the most efficient state flavor for exception RPC's:
 
                 * task_extmod_info_t (8 64-bit ints)
                 * task_basic_info_64_2_t
                 * mach_task_basic_info_t (12 ints)
+                * task_power_info_t (18 ints)
                 * If other task_info flavors are added, this
                 * definition may need to be changed. (See
                 * mach/task_info.h and mach/policy.h) */
                 *      kernel_resource_sizes_t (5 ints)
                 *      host_load_info_t (6 ints)
                 *      vm_statistics32_t (15 ints)
+                *      host_expired_task_info uses a task_power_info (18 ints)
                 * 
                 * If other host_info flavors are added, this definition may
                 * need to be changed. (See mach/{host_info,vm_statistics}.h)
                 */
 type host_flavor_t             = int;
-type host_info_t               = array[*:15] of integer_t;
+type host_info_t               = array[*:18] of integer_t;
 
 
                /* 
 
 #define CPUFAMILY_INTEL_WESTMERE       0x573b5eec
 #define CPUFAMILY_INTEL_SANDYBRIDGE    0x5490b78c
 #define CPUFAMILY_INTEL_IVYBRIDGE      0x1f65e835
+#define CPUFAMILY_INTEL_HASWELL                0x10b282dc
 #define CPUFAMILY_ARM_9                        0xe73283ae
 #define CPUFAMILY_ARM_11               0x8ff620d8
 #define CPUFAMILY_ARM_XSCALE           0x53b005f5
 
 
 #define        THREAD_STATE_FLAVOR_LIST        0       /* List of valid flavors */
 #define THREAD_STATE_FLAVOR_LIST_NEW   128
+#define THREAD_STATE_FLAVOR_LIST_10_9  129
 
 typedef        int                     thread_state_flavor_t;
 typedef thread_state_flavor_t  *thread_state_flavor_array_t;
 
        vm_object_t             object;
        vm_object_offset_t      offset;
        vm_object_offset_t      pg_offset;
-       vm_map_entry_t          entry;
+       vm_map_entry_t          entry = NULL;
        vm_map_offset_t         map_addr, fill_start;
        vm_map_offset_t         map_mask;
        vm_map_size_t           map_size, fill_size;
 
 export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule
 export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir
 
-
 include $(MakeInc_cmd)
 include $(MakeInc_def)
 
-EXPORT_ONLY_FILES =
-
-INSTALL_MD_DIR = x86_64
-
-INSTALL_MD_LIST = 
-
-INSTALL_MD_LCL_LIST =
-
-EXPORT_MD_LIST = ${EXPORT_ONLY_FILES}
-
-EXPORT_MD_DIR = x86_64
 
 include $(MakeInc_rule)
 include $(MakeInc_dir)
 
        movq    $1, %rax
        ret
 
+/*
+ * int rdmsr64_carefully(uint32_t msr, uint64_t *val);
+ *
+ * Reads the MSR selected by msr and stores the 64-bit result through val.
+ * Returns 0 once the value has been stored; the rdmsr64_carefully_fail
+ * path returns 1 without writing through val.
+ * NOTE(review): RECOVER() presumably registers rdmsr64_carefully_fail as the
+ * resume point should the rdmsr fault -- confirm against the macro.
+ */
+
+ENTRY(rdmsr64_carefully)
+       movl    %edi, %ecx              /* rdmsr takes the MSR index in %ecx */
+       RECOVERY_SECTION
+       RECOVER(rdmsr64_carefully_fail)
+       rdmsr
+       movl    %eax, (%rsi)            /* low 32 bits of the result */
+       movl    %edx, 4(%rsi)           /* high 32 bits of the result */
+       xorl    %eax, %eax              /* return 0 */
+       ret
+rdmsr64_carefully_fail:
+       movl    $1, %eax                /* return 1 */
+       ret
+/*
+ * int wrmsr64_carefully(uint32_t msr, uint64_t val);
+ *
+ * Writes val to the MSR selected by msr.  Returns 0 once the wrmsr has
+ * completed; the wrmsr64_carefully_fail path returns 1.
+ * NOTE(review): RECOVER() presumably registers wrmsr64_carefully_fail as the
+ * resume point should the wrmsr fault -- confirm against the macro.
+ *
+ * The entry point is exported as wrmsr64_carefully, the name used by the
+ * prototype.  wrmsr_carefully is exported as a second name for the same
+ * address so that any caller using that symbol still links.
+ */
+
+.globl EXT(wrmsr_carefully)
+ENTRY(wrmsr64_carefully)
+LEXT(wrmsr_carefully)
+       movl    %edi, %ecx              /* wrmsr takes the MSR index in %ecx */
+       movl    %esi, %eax              /* low 32 bits of val */
+       shr     $32, %rsi
+       movl    %esi, %edx              /* high 32 bits of val */
+       RECOVERY_SECTION
+       RECOVER(wrmsr64_carefully_fail)
+       wrmsr
+       xorl    %eax, %eax              /* return 0 */
+       ret
+wrmsr64_carefully_fail:
+       movl    $1, %eax                /* return 1 */
+       ret
+
 .globl EXT(thread_exception_return)
 .globl EXT(thread_bootstrap_return)
 LEXT(thread_bootstrap_return)
 
        ret
 
 /*
- * unint64_t _rtc_nanotime_read(rtc_nanotime_t *rntp, int slow);
+ * uint64_t _rtc_nanotime_read(rtc_nanotime_t *rntp);
  *
  * This is the same as the commpage nanotime routine, except that it uses the
  * kernel internal "rtc_nanotime_info" data instead of the commpage data.
  * These two copies of data are kept in sync by rtc_clock_napped().
  *
- * Warning!  There is another copy of this code in osfmk/x86_64/idt64.s.
- * These are kept in sync by both using the RTC_NANOTIME_READ() macro.
+ * Warning!  There are several copies of this code in the trampolines found in
+ * osfmk/x86_64/idt64.s, coming from the various TIMER macros in rtclock_asm.h.
+ * They're all kept in sync by using the RTC_NANOTIME_READ() macro.
  *
- * There are two versions of this algorithm, for "slow" and "fast" processors.
- * The more common "fast" algorithm is:
+ * The algorithm we use is:
  *
- *     ns = (((rdtsc - rnt_tsc_base)*rnt_tsc_scale) / 2**32) + rnt_ns_base;
+ *     ns = ((((rdtsc - rnt_tsc_base)<<rnt_shift)*rnt_tsc_scale) / 2**32) + rnt_ns_base;
  *
- * Of course, the divide by 2**32 is a nop.  rnt_tsc_scale is a constant
- * computed during initialization:
+ * rnt_shift, a constant computed during initialization, is the smallest value for which:
  *
- *     rnt_tsc_scale = (10e9 * 2**32) / tscFreq;
+ *     (tscFreq << rnt_shift) > SLOW_TSC_THRESHOLD
  *
- * The "slow" algorithm uses long division:
+ * Where SLOW_TSC_THRESHOLD is about 10**9.  Since most processors' tscFreqs are greater
+ * than 1GHz, rnt_shift is usually 0.  rnt_tsc_scale is also a 32-bit constant:
  *
- *     ns = (((rdtsc - rnt_tsc_base) * 10e9) / tscFreq) + rnt_ns_base;
+ *     rnt_tsc_scale = (10**9 * 2**32) / (tscFreq << rnt_shift);
+ *
+ * On 64-bit processors this algorithm could be simplified by doing a 64x64 bit
+ * multiply of rdtsc by tscFCvtt2n:
+ *
+ *     ns = (((rdtsc - rnt_tsc_base) * tscFCvtt2n) / 2**32) + rnt_ns_base;
+ *
+ * We don't do so in order to use the same algorithm in 32- and 64-bit mode.
+ * When U32 goes away, we should reconsider.
  *
  * Since this routine is not synchronized and can be called in any context, 
  * we use a generation count to guard against seeing partially updated data.
  * the generation is zero.
  *
  * unint64_t _rtc_nanotime_read(
- *                     rtc_nanotime_t *rntp,           // %rdi
- *                     int            slow);           // %rsi
+ *                     rtc_nanotime_t *rntp);          // %rdi
  *
  */
 ENTRY(_rtc_nanotime_read)
-       test            %rsi,%rsi
-       jnz             Lslow
-               
-       /*
-        * Processor whose TSC frequency is faster than SLOW_TSC_THRESHOLD
-        */
+
        PAL_RTC_NANOTIME_READ_FAST()
 
        ret
+    
+/*
+ * extern uint64_t _rtc_tsc_to_nanoseconds(
+ *          uint64_t    value,              // %rdi
+ *          pal_rtc_nanotime_t *rntp);     // %rsi
+ *
+ * Converts TSC units to nanoseconds, using an abbreviated form of the above
+ * algorithm.  Note that while we could have simply used tmrCvt(value,tscFCvtt2n),
+ * which would avoid the need for this asm, doing so is a bit more risky since
+ * we'd be using a different algorithm with possibly different rounding etc.
+ */
 
-       /*
-        * Processor whose TSC frequency is not faster than SLOW_TSC_THRESHOLD
-        * But K64 doesn't support this...
-        */
-Lslow:
-       lea     1f(%rip),%rdi
-       xorb    %al,%al
-       call    EXT(panic)
-       hlt
-       .data
-1:     String  "_rtc_nanotime_read() - slow algorithm not supported"
-       .text
+ENTRY(_rtc_tsc_to_nanoseconds)
+       movq    %rdi,%rax                       /* copy value (in TSC units) to convert */
+       movl    RNT_SHIFT(%rsi),%ecx            /* shift count from rntp, consumed via %cl */
+       movl    RNT_SCALE(%rsi),%edx            /* 32-bit scale from rntp; movl clears upper half of %rdx */
+       shlq    %cl,%rax                        /* tscUnits << shift */
+       mulq    %rdx                            /* (tscUnits << shift) * scale */
+       shrdq   $32,%rdx,%rax                   /* %rdx:%rax >>= 32 */
+       ret                                     /* converted value is left in %rax */
+    
+    
 
 Entry(call_continuation)
        movq    %rdi,%rcx                       /* get continuation */
 
 void
 pmap_cpu_init(void)
 {
+       cpu_data_t      *cdp = current_cpu_datap();
        /*
         * Here early in the life of a processor (from cpu_mode_init()).
         * Ensure global page feature is disabled at this point.
        /*
         * Initialize the per-cpu, TLB-related fields.
         */
-       current_cpu_datap()->cpu_kernel_cr3 = kernel_pmap->pm_cr3;
-       current_cpu_datap()->cpu_active_cr3 = kernel_pmap->pm_cr3;
-       current_cpu_datap()->cpu_tlb_invalid = FALSE;
-       current_cpu_datap()->cpu_task_map = TASK_MAP_64BIT;
+       cdp->cpu_kernel_cr3 = kernel_pmap->pm_cr3;
+       cdp->cpu_active_cr3 = kernel_pmap->pm_cr3;
+       cdp->cpu_tlb_invalid = FALSE;
+       cdp->cpu_task_map = TASK_MAP_64BIT;
        pmap_pcid_configure();
        if (cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_SMEP) {
                boolean_t nsmep;
                        pmap_smep_enabled = TRUE;
                }
        }
+
+       if (cdp->cpu_fixed_pmcs_enabled) {
+               boolean_t enable = TRUE;
+               cpu_pmc_control(&enable);
+       }
 }
 
 
 
 void serial_putc( char c )
 {
     uart_putc(c);
-    if (c == '\n') uart_putc('\r');
 }
 
 int serial_getc( void )