From bd504ef0e0b883cdd7917b73b3574eb9ce669905 Mon Sep 17 00:00:00 2001 From: Apple Date: Wed, 9 Oct 2013 15:39:01 +0000 Subject: [PATCH] xnu-2050.48.11.tar.gz --- bsd/dev/i386/sysctl.c | 28 +- bsd/dev/random/randomdev.c | 47 +- bsd/hfs/hfs_cnode.c | 58 ++- bsd/hfs/hfs_hotfiles.c | 20 +- bsd/hfs/hfs_readwrite.c | 8 +- bsd/kern/kdebug.c | 4 +- bsd/kern/kern_mib.c | 72 ++- bsd/kern/kern_symfile.c | 12 +- bsd/kern/trace.codes | 41 ++ bsd/net/dlil.c | 11 +- bsd/netinet/igmp.c | 8 +- bsd/netinet/in_pcb.c | 172 +++--- bsd/netinet/in_pcb.h | 22 +- bsd/sys/cprotect.h | 3 + bsd/sys/kdebug.h | 4 +- config/MasterVersion | 2 +- config/Private.x86_64.exports | 1 + iokit/IOKit/IOHibernatePrivate.h | 6 +- iokit/IOKit/IOPolledInterface.h | 1 + iokit/IOKit/IOService.h | 5 +- iokit/IOKit/pwr_mgt/IOPM.h | 34 +- iokit/IOKit/pwr_mgt/IOPMPrivate.h | 9 +- iokit/Kernel/IOHibernateIO.cpp | 34 +- iokit/Kernel/IOHibernateInternal.h | 1 + iokit/Kernel/IOHibernateRestoreKernel.c | 36 +- iokit/Kernel/IOMemoryDescriptor.cpp | 19 +- iokit/Kernel/IOPMrootDomain.cpp | 74 ++- iokit/Kernel/IOService.cpp | 7 +- iokit/Kernel/IOServicePM.cpp | 522 ++++++++++++------- iokit/Kernel/IOServicePMPrivate.h | 42 +- kgmacros | 2 +- libkern/c++/OSMetaClass.cpp | 2 + libsyscall/wrappers/__get_cpu_capabilities.s | 7 +- osfmk/conf/files.x86_64 | 1 + osfmk/i386/AT386/model_dep.c | 4 - osfmk/i386/Diagnostics.c | 132 ++++- osfmk/i386/Diagnostics.h | 14 +- osfmk/i386/acpi.c | 117 ++++- osfmk/i386/acpi.h | 1 + osfmk/i386/commpage/commpage.c | 128 +++-- osfmk/i386/commpage/commpage.h | 6 +- osfmk/i386/commpage/fifo_queues.s | 16 +- osfmk/i386/commpage/pthreads.s | 4 +- osfmk/i386/cpu.c | 2 - osfmk/i386/cpu_capabilities.h | 25 +- osfmk/i386/cpu_data.h | 44 +- osfmk/i386/cpu_topology.h | 1 + osfmk/i386/cpuid.c | 36 +- osfmk/i386/cpuid.h | 14 +- osfmk/i386/etimer.c | 6 +- osfmk/i386/fpu.c | 8 + osfmk/i386/genassym.c | 2 - osfmk/i386/hibernate_restore.c | 1 + osfmk/i386/i386_init.c | 7 +- osfmk/i386/lapic.c | 1 - osfmk/i386/lapic.h | 14 +- osfmk/i386/lapic_native.c | 125 +++-- osfmk/i386/machine_check.c | 11 +- osfmk/i386/machine_routines.c | 10 +- osfmk/i386/machine_routines.h | 2 +- osfmk/i386/misc_protos.h | 1 + osfmk/i386/mp.c | 71 ++- osfmk/i386/mp_native.c | 12 +- osfmk/i386/pal_native.h | 6 +- osfmk/i386/pal_routines.h | 2 +- osfmk/i386/pcb.c | 81 ++- osfmk/i386/pmCPU.c | 174 +++---- osfmk/i386/pmCPU.h | 11 + osfmk/i386/proc_reg.h | 22 +- osfmk/i386/rtclock.c | 86 +-- osfmk/i386/rtclock_asm_native.h | 22 +- osfmk/i386/rtclock_protos.h | 7 +- osfmk/i386/trap.c | 7 +- osfmk/i386/tsc.c | 5 +- osfmk/i386/tsc.h | 2 +- osfmk/kern/clock.c | 2 +- osfmk/kern/clock.h | 2 +- osfmk/kern/machine.c | 43 +- osfmk/kern/processor.c | 4 + osfmk/kern/startup.c | 8 +- osfmk/mach/branch_predicates.h | 4 +- osfmk/mach/i386/thread_status.h | 16 +- osfmk/mach/mach_types.defs | 4 +- osfmk/mach/machine.h | 1 + osfmk/mach/thread_status.h | 1 + osfmk/vm/vm_kern.c | 2 +- osfmk/x86_64/Makefile | 12 - osfmk/x86_64/locore.s | 34 ++ osfmk/x86_64/machine_routines_asm.s | 73 +-- osfmk/x86_64/pmap.c | 14 +- pexpert/i386/pe_serial.c | 1 - 91 files changed, 1843 insertions(+), 933 deletions(-) diff --git a/bsd/dev/i386/sysctl.c b/bsd/dev/i386/sysctl.c index ddc9503e3..f97173075 100644 --- a/bsd/dev/i386/sysctl.c +++ b/bsd/dev/i386/sysctl.c @@ -33,10 +33,13 @@ #include #include #include +#include #include #include #include #include +#include + static int _i386_cpu_info SYSCTL_HANDLER_ARGS @@ -730,7 +733,30 @@ SYSCTL_QUAD(_machdep_memmap, OID_AUTO, Other, CTLFLAG_RD|CTLFLAG_LOCKED, 
&firmwa SYSCTL_NODE(_machdep, OID_AUTO, tsc, CTLFLAG_RD|CTLFLAG_LOCKED, NULL, "Timestamp counter parameters"); -SYSCTL_QUAD(_machdep_tsc, OID_AUTO, frequency, CTLFLAG_RD|CTLFLAG_LOCKED, &tscFreq, ""); +SYSCTL_QUAD(_machdep_tsc, OID_AUTO, frequency, + CTLFLAG_RD|CTLFLAG_LOCKED, &tscFreq, ""); + +extern uint32_t deep_idle_rebase; +SYSCTL_UINT(_machdep_tsc, OID_AUTO, deep_idle_rebase, + CTLFLAG_RW|CTLFLAG_KERN|CTLFLAG_LOCKED, &deep_idle_rebase, 0, ""); + +SYSCTL_NODE(_machdep_tsc, OID_AUTO, nanotime, + CTLFLAG_RD|CTLFLAG_LOCKED, NULL, "TSC to ns conversion"); +SYSCTL_QUAD(_machdep_tsc_nanotime, OID_AUTO, tsc_base, + CTLFLAG_RD | CTLFLAG_LOCKED, + (uint64_t *) &pal_rtc_nanotime_info.tsc_base, ""); +SYSCTL_QUAD(_machdep_tsc_nanotime, OID_AUTO, ns_base, + CTLFLAG_RD | CTLFLAG_LOCKED, + (uint64_t *)&pal_rtc_nanotime_info.ns_base, ""); +SYSCTL_UINT(_machdep_tsc_nanotime, OID_AUTO, scale, + CTLFLAG_RD | CTLFLAG_LOCKED, + (uint32_t *)&pal_rtc_nanotime_info.scale, 0, ""); +SYSCTL_UINT(_machdep_tsc_nanotime, OID_AUTO, shift, + CTLFLAG_RD | CTLFLAG_LOCKED, + (uint32_t *)&pal_rtc_nanotime_info.shift, 0, ""); +SYSCTL_UINT(_machdep_tsc_nanotime, OID_AUTO, generation, + CTLFLAG_RD | CTLFLAG_LOCKED, + (uint32_t *)&pal_rtc_nanotime_info.generation, 0, ""); SYSCTL_NODE(_machdep, OID_AUTO, misc, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "Miscellaneous x86 kernel parameters"); diff --git a/bsd/dev/random/randomdev.c b/bsd/dev/random/randomdev.c index c081bd209..c29e9f877 100644 --- a/bsd/dev/random/randomdev.c +++ b/bsd/dev/random/randomdev.c @@ -56,6 +56,7 @@ #include #include +#include #include #include @@ -101,13 +102,14 @@ static struct cdevsw random_cdevsw = /* Used to detect whether we've already been initialized */ -static int gRandomInstalled = 0; +static UInt8 gRandomInstalled = 0; static PrngRef gPrngRef; static int gRandomError = 1; static lck_grp_t *gYarrowGrp; static lck_attr_t *gYarrowAttr; static lck_grp_attr_t *gYarrowGrpAttr; static lck_mtx_t *gYarrowMutex = 0; +static UInt8 gYarrowInitializationLock = 0; #define RESEED_TICKS 50 /* how long a reseed operation can take */ @@ -307,6 +309,27 @@ PreliminarySetup(void) { prng_error_status perr; + /* Multiple threads can enter this as a result of an earlier + * check of gYarrowMutex. We make sure that only one of them + * can enter at a time. If one of them enters and discovers + * that gYarrowMutex is no longer NULL, we know that another + * thread has initialized the Yarrow state and we can exit. + */ + + /* The first thread that enters this function will find + * gYarrowInitializationLock set to 0. It will atomically + * set the value to 1 and, seeing that it was zero, drop + * out of the loop. Other threads will see that the value is + * 1 and continue to loop until we are initialized. + */ + + while (OSTestAndSet(0, &gYarrowInitializationLock)); /* serialize access to this function */ + + if (gYarrowMutex) { + /* we've already been initialized, clear and get out */ + goto function_exit; + } + /* create a Yarrow object */ perr = prngInitialize(&gPrngRef); if (perr != 0) { @@ -321,6 +344,8 @@ PreliminarySetup(void) char buffer [16]; /* get a little non-deterministic data as an initial seed. */ + /* On OSX, securityd will add much more entropy as soon as it */ + /* comes up. On iOS, entropy is added with each system interrupt. 
*/ microtime(&tt); /* @@ -334,7 +359,7 @@ PreliminarySetup(void) if (perr != 0) { /* an error, complain */ printf ("Couldn't seed Yarrow.\n"); - return; + goto function_exit; } /* turn the data around */ @@ -350,6 +375,10 @@ PreliminarySetup(void) gYarrowMutex = lck_mtx_alloc_init(gYarrowGrp, gYarrowAttr); fips_initialize (); + +function_exit: + /* allow other threads to figure out whether or not we have been initialized. */ + gYarrowInitializationLock = 0; } const Block kKnownAnswer = {0x92, 0xb4, 0x04, 0xe5, 0x56, 0x58, 0x8c, 0xed, 0x6c, 0x1a, 0xcd, 0x4e, 0xbf, 0x05, 0x3f, 0x68, 0x09, 0xf7, 0x3a, 0x93}; @@ -384,14 +413,11 @@ random_init(void) { int ret; - if (gRandomInstalled) + if (OSTestAndSet(0, &gRandomInstalled)) { + /* do this atomically so that it works correctly with + multiple threads */ return; - - /* install us in the file system */ - gRandomInstalled = 1; - - /* setup yarrow and the mutex */ - PreliminarySetup(); + } ret = cdevsw_add(RANDOM_MAJOR, &random_cdevsw); if (ret < 0) { @@ -409,6 +435,9 @@ random_init(void) */ devfs_make_node(makedev (ret, 1), DEVFS_CHAR, UID_ROOT, GID_WHEEL, 0666, "urandom", 0); + + /* setup yarrow and the mutex if needed*/ + PreliminarySetup(); } int diff --git a/bsd/hfs/hfs_cnode.c b/bsd/hfs/hfs_cnode.c index d50734286..65f2825d0 100644 --- a/bsd/hfs/hfs_cnode.c +++ b/bsd/hfs/hfs_cnode.c @@ -312,20 +312,31 @@ int hfs_cnode_teardown (struct vnode *vp, vfs_context_t ctx, int reclaim) { (cp->c_flag & C_DELETED) && ((forkcount == 1) || (!VNODE_IS_RSRC(vp)))) { - /* Start a transaction here. We're about to change file sizes */ - if (started_tr == 0) { - if (hfs_start_transaction(hfsmp) != 0) { - error = EINVAL; - goto out; - } - else { - started_tr = 1; - } - } - /* Truncate away our own fork data. (Case A, B, C above) */ if (VTOF(vp)->ff_blocks != 0) { - + + /* + * SYMLINKS only: + * + * Encapsulate the entire change (including truncating the link) in + * nested transactions if we are modifying a symlink, because we know that its + * file length will be at most 4k, and we can fit both the truncation and + * any relevant bitmap changes into a single journal transaction. We also want + * the kill_block code to execute in the same transaction so that any dirty symlink + * blocks will not be written. Otherwise, rely on + * hfs_truncate doing its own transactions to ensure that we don't blow up + * the journal. + */ + if ((started_tr == 0) && (v_type == VLNK)) { + if (hfs_start_transaction(hfsmp) != 0) { + error = EINVAL; + goto out; + } + else { + started_tr = 1; + } + } + /* * At this point, we have decided that this cnode is * suitable for full removal. We are about to deallocate @@ -348,20 +359,23 @@ int hfs_cnode_teardown (struct vnode *vp, vfs_context_t ctx, int reclaim) { if (hfsmp->jnl && vnode_islnk(vp)) { buf_iterate(vp, hfs_removefile_callback, BUF_SKIP_NONLOCKED, (void *)hfsmp); } - + /* - * Since we're already inside a transaction, - * tell hfs_truncate to skip the ubc_setsize. - * * This truncate call (and the one below) is fine from VNOP_RECLAIM's * context because we're only removing blocks, not zero-filling new * ones. The C_DELETED check above makes things much simpler. 
*/ - error = hfs_truncate(vp, (off_t)0, IO_NDELAY, 1, 0, ctx); + error = hfs_truncate(vp, (off_t)0, IO_NDELAY, 0, 0, ctx); if (error) { goto out; } truncated = 1; + + /* (SYMLINKS ONLY): Close/End our transaction after truncating the file record */ + if (started_tr) { + hfs_end_transaction(hfsmp); + started_tr = 0; + } } /* @@ -369,7 +383,9 @@ int hfs_cnode_teardown (struct vnode *vp, vfs_context_t ctx, int reclaim) { * it is the last fork. That means, by definition, the rsrc fork is not in * core. To avoid bringing a vnode into core for the sole purpose of deleting the * data in the resource fork, we call cat_lookup directly, then hfs_release_storage - * to get rid of the resource fork's data. + * to get rid of the resource fork's data. Note that because we are holding the + * cnode lock, it is impossible for a competing thread to create the resource fork + * vnode from underneath us while we do this. * * This is invoked via case A above only. */ @@ -441,12 +457,6 @@ int hfs_cnode_teardown (struct vnode *vp, vfs_context_t ctx, int reclaim) { */ cp->c_blocks = 0; } - - /* End the transaction from the start of the file truncation segment */ - if (started_tr) { - hfs_end_transaction(hfsmp); - started_tr = 0; - } } /* diff --git a/bsd/hfs/hfs_hotfiles.c b/bsd/hfs/hfs_hotfiles.c index 7ebb82bc6..50a29e223 100644 --- a/bsd/hfs/hfs_hotfiles.c +++ b/bsd/hfs/hfs_hotfiles.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2008 Apple Inc. All rights reserved. + * Copyright (c) 2003-2013 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -207,8 +207,8 @@ hfs_recording_start(struct hfsmount *hfsmp) (SWAP_BE32 (hotfileinfo.timeleft) > 0) && (SWAP_BE32 (hotfileinfo.timebase) > 0)) { hfsmp->hfc_maxfiles = SWAP_BE32 (hotfileinfo.maxfilecnt); - hfsmp->hfc_timeout = SWAP_BE32 (hotfileinfo.timeleft) + tv.tv_sec ; hfsmp->hfc_timebase = SWAP_BE32 (hotfileinfo.timebase); + hfsmp->hfc_timeout = SWAP_BE32 (hotfileinfo.timeleft) + tv.tv_sec ; /* Fix up any bogus timebase values. */ if (hfsmp->hfc_timebase < HFC_MIN_BASE_TIME) { hfsmp->hfc_timebase = hfsmp->hfc_timeout - HFC_DEFAULT_DURATION; @@ -792,7 +792,8 @@ hfs_addhotfile_internal(struct vnode *vp) if (hfsmp->hfc_stage != HFC_RECORDING) return (0); - if ((!vnode_isreg(vp) && !vnode_islnk(vp)) || vnode_issystem(vp)) { + /* Only regular files are allowed for hotfile inclusion ; symlinks disallowed */ + if ((!vnode_isreg(vp)) || vnode_issystem(vp)) { return (0); } /* Skip resource forks for now. 
*/ @@ -862,7 +863,8 @@ hfs_removehotfile(struct vnode *vp) if (hfsmp->hfc_stage != HFC_RECORDING) return (0); - if ((!vnode_isreg(vp) && !vnode_islnk(vp)) || vnode_issystem(vp)) { + /* Only regular files can move out of hotfiles */ + if ((!vnode_isreg(vp)) || vnode_issystem(vp)) { return (0); } @@ -904,7 +906,7 @@ out: static int hotfiles_collect_callback(struct vnode *vp, __unused void *cargs) { - if ((vnode_isreg(vp) || vnode_islnk(vp)) && !vnode_issystem(vp)) + if ((vnode_isreg(vp)) && !vnode_issystem(vp)) (void) hfs_addhotfile_internal(vp); return (VNODE_RETURNED); @@ -1138,7 +1140,9 @@ hotfiles_adopt(struct hfsmount *hfsmp) } break; } - if (!vnode_isreg(vp) && !vnode_islnk(vp)) { + + /* only regular files are eligible */ + if (!vnode_isreg(vp)) { printf("hfs: hotfiles_adopt: huh, not a file %d (%d)\n", listp->hfl_hotfile[i].hf_fileid, VTOC(vp)->c_cnid); hfs_unlock(VTOC(vp)); vnode_put(vp); @@ -1361,7 +1365,9 @@ hotfiles_evict(struct hfsmount *hfsmp, vfs_context_t ctx) } break; } - if (!vnode_isreg(vp) && !vnode_islnk(vp)) { + + /* only regular files are eligible */ + if (!vnode_isreg(vp)) { printf("hfs: hotfiles_evict: huh, not a file %d\n", key->fileID); hfs_unlock(VTOC(vp)); vnode_put(vp); diff --git a/bsd/hfs/hfs_readwrite.c b/bsd/hfs/hfs_readwrite.c index f0b91b94a..b9bcdd036 100644 --- a/bsd/hfs/hfs_readwrite.c +++ b/bsd/hfs/hfs_readwrite.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2012 Apple Inc. All rights reserved. + * Copyright (c) 2000-2013 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -4391,7 +4391,8 @@ hfs_relocate(struct vnode *vp, u_int32_t blockHint, kauth_cred_t cred, enum vtype vnodetype; vnodetype = vnode_vtype(vp); - if (vnodetype != VREG && vnodetype != VLNK) { + if (vnodetype != VREG) { + /* Note symlinks are not allowed to be relocated */ return (EPERM); } @@ -4424,8 +4425,7 @@ hfs_relocate(struct vnode *vp, u_int32_t blockHint, kauth_cred_t cred, if (blockHint == 0) blockHint = hfsmp->nextAllocation; - if ((fp->ff_size > 0x7fffffff) || - ((fp->ff_size > blksize) && vnodetype == VLNK)) { + if ((fp->ff_size > 0x7fffffff)) { return (EFBIG); } diff --git a/bsd/kern/kdebug.c b/bsd/kern/kdebug.c index b25b3f9d4..79896dbbe 100644 --- a/bsd/kern/kdebug.c +++ b/bsd/kern/kdebug.c @@ -2459,13 +2459,15 @@ error_exit: } void -start_kern_tracing(unsigned int new_nkdbufs) { +start_kern_tracing(unsigned int new_nkdbufs, boolean_t need_map) { if (!new_nkdbufs) return; nkdbufs = kdbg_set_nkdbufs(new_nkdbufs); kdbg_lock_init(); kdbg_reinit(TRUE); + if (need_map == TRUE) + kdbg_mapinit(); kdbg_set_tracing_enabled(TRUE, KDEBUG_ENABLE_TRACE); #if defined(__i386__) || defined(__x86_64__) diff --git a/bsd/kern/kern_mib.c b/bsd/kern/kern_mib.c index 7c27eb16d..497ab5a44 100644 --- a/bsd/kern/kern_mib.c +++ b/bsd/kern/kern_mib.c @@ -406,35 +406,38 @@ SYSCTL_PROC(_hw, HW_L3SETTINGS, l3settings, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG SYSCTL_INT (_hw, OID_AUTO, cputhreadtype, CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, &cputhreadtype, 0, ""); #if defined(__i386__) || defined(__x86_64__) -int mmx_flag = -1; -int sse_flag = -1; -int sse2_flag = -1; -int sse3_flag = -1; -int sse4_1_flag = -1; -int sse4_2_flag = -1; -int x86_64_flag = -1; -int supplementalsse3_flag = -1; -int aes_flag = -1; -int avx1_0_flag = -1; -int rdrand_flag = -1; -int f16c_flag = -1; -int enfstrg_flag = -1; - -SYSCTL_INT(_hw_optional, OID_AUTO, mmx, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &mmx_flag, 0, ""); -SYSCTL_INT(_hw_optional, OID_AUTO, sse, CTLFLAG_RD | 
CTLFLAG_KERN | CTLFLAG_LOCKED, &sse_flag, 0, ""); -SYSCTL_INT(_hw_optional, OID_AUTO, sse2, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &sse2_flag, 0, ""); -SYSCTL_INT(_hw_optional, OID_AUTO, sse3, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &sse3_flag, 0, ""); -SYSCTL_INT(_hw_optional, OID_AUTO, supplementalsse3, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &supplementalsse3_flag, 0, ""); -SYSCTL_INT(_hw_optional, OID_AUTO, sse4_1, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &sse4_1_flag, 0, ""); -SYSCTL_INT(_hw_optional, OID_AUTO, sse4_2, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &sse4_2_flag, 0, ""); +static int +sysctl_cpu_capability +(__unused struct sysctl_oid *oidp, void *arg1, __unused int arg2, struct sysctl_req *req) +{ + uint64_t mask = (uint64_t) (uintptr_t) arg1; + boolean_t is_capable = (_get_cpu_capabilities() & mask) != 0; + + return SYSCTL_OUT(req, &is_capable, sizeof(is_capable)); + +} + +SYSCTL_PROC(_hw_optional, OID_AUTO, mmx, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (void *) kHasMMX, 0, sysctl_cpu_capability, "I", ""); +SYSCTL_PROC(_hw_optional, OID_AUTO, sse, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (void *) kHasSSE, 0, sysctl_cpu_capability, "I", ""); +SYSCTL_PROC(_hw_optional, OID_AUTO, sse2, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (void *) kHasSSE2, 0, sysctl_cpu_capability, "I", ""); +SYSCTL_PROC(_hw_optional, OID_AUTO, sse3, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (void *) kHasSSE3, 0, sysctl_cpu_capability, "I", ""); +SYSCTL_PROC(_hw_optional, OID_AUTO, supplementalsse3, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (void *) kHasSupplementalSSE3, 0, sysctl_cpu_capability, "I", ""); +SYSCTL_PROC(_hw_optional, OID_AUTO, sse4_1, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (void *) kHasSSE4_1, 0, sysctl_cpu_capability, "I", ""); +SYSCTL_PROC(_hw_optional, OID_AUTO, sse4_2, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (void *) kHasSSE4_2, 0, sysctl_cpu_capability, "I", ""); /* "x86_64" is actually a preprocessor symbol on the x86_64 kernel, so we have to hack this */ #undef x86_64 -SYSCTL_INT(_hw_optional, OID_AUTO, x86_64, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &x86_64_flag, 0, ""); -SYSCTL_INT(_hw_optional, OID_AUTO, aes, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &aes_flag, 0, ""); -SYSCTL_INT(_hw_optional, OID_AUTO, avx1_0, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &avx1_0_flag, 0, ""); -SYSCTL_INT(_hw_optional, OID_AUTO, rdrand, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &rdrand_flag, 0, ""); -SYSCTL_INT(_hw_optional, OID_AUTO, f16c, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &f16c_flag, 0, ""); -SYSCTL_INT(_hw_optional, OID_AUTO, enfstrg, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &enfstrg_flag, 0, ""); +SYSCTL_PROC(_hw_optional, OID_AUTO, x86_64, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (void *) k64Bit, 0, sysctl_cpu_capability, "I", ""); +SYSCTL_PROC(_hw_optional, OID_AUTO, aes, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (void *) kHasAES, 0, sysctl_cpu_capability, "I", ""); +SYSCTL_PROC(_hw_optional, OID_AUTO, avx1_0, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (void *) kHasAVX1_0, 0, sysctl_cpu_capability, "I", ""); +SYSCTL_PROC(_hw_optional, OID_AUTO, rdrand, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (void *) kHasRDRAND, 0, sysctl_cpu_capability, "I", ""); +SYSCTL_PROC(_hw_optional, OID_AUTO, f16c, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (void *) kHasF16C, 0, sysctl_cpu_capability, "I", ""); +SYSCTL_PROC(_hw_optional, OID_AUTO, enfstrg, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (void *) kHasENFSTRG, 0, 
sysctl_cpu_capability, "I", ""); +SYSCTL_PROC(_hw_optional, OID_AUTO, fma, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (void *) kHasFMA, 0, sysctl_cpu_capability, "I", ""); +SYSCTL_PROC(_hw_optional, OID_AUTO, avx2_0, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (void *) kHasAVX2_0, 0, sysctl_cpu_capability, "I", ""); +SYSCTL_PROC(_hw_optional, OID_AUTO, bmi1, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (void *) kHasBMI1, 0, sysctl_cpu_capability, "I", ""); +SYSCTL_PROC(_hw_optional, OID_AUTO, bmi2, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (void *) kHasBMI2, 0, sysctl_cpu_capability, "I", ""); +SYSCTL_PROC(_hw_optional, OID_AUTO, rtm, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (void *) kHasRTM, 0, sysctl_cpu_capability, "I", ""); +SYSCTL_PROC(_hw_optional, OID_AUTO, hle, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (void *) kHasHLE, 0, sysctl_cpu_capability, "I", ""); #else #error Unsupported arch #endif /* !__i386__ && !__x86_64 && !__arm__ */ @@ -498,21 +501,6 @@ sysctl_mib_init(void) } #if defined (__i386__) || defined (__x86_64__) -#define is_capability_set(k) (((_get_cpu_capabilities() & (k)) == (k)) ? 1 : 0) - mmx_flag = is_capability_set(kHasMMX); - sse_flag = is_capability_set(kHasSSE); - sse2_flag = is_capability_set(kHasSSE2); - sse3_flag = is_capability_set(kHasSSE3); - supplementalsse3_flag = is_capability_set(kHasSupplementalSSE3); - sse4_1_flag = is_capability_set(kHasSSE4_1); - sse4_2_flag = is_capability_set(kHasSSE4_2); - x86_64_flag = is_capability_set(k64Bit); - aes_flag = is_capability_set(kHasAES); - avx1_0_flag = is_capability_set(kHasAVX1_0); - rdrand_flag = is_capability_set(kHasRDRAND); - f16c_flag = is_capability_set(kHasF16C); - enfstrg_flag = is_capability_set(kHasENFSTRG); - /* hw.cpufamily */ cpufamily = cpuid_cpufamily(); diff --git a/bsd/kern/kern_symfile.c b/bsd/kern/kern_symfile.c index 1636b4ab5..2e1965dfd 100644 --- a/bsd/kern/kern_symfile.c +++ b/bsd/kern/kern_symfile.c @@ -203,7 +203,7 @@ kern_open_file_for_direct_io(const char * name, int isssd = 0; uint32_t flags = 0; uint32_t blksize; - off_t maxiocount, count; + off_t maxiocount, count, segcount; boolean_t locked = FALSE; int (*do_ioctl)(void * p1, void * p2, u_long theIoctl, caddr_t result); @@ -406,14 +406,20 @@ kern_open_file_for_direct_io(const char * name, maxiocount = count; error = do_ioctl(p1, p2, DKIOCGETMAXSEGMENTBYTECOUNTREAD, (caddr_t) &count); + if (!error) + error = do_ioctl(p1, p2, DKIOCGETMAXSEGMENTCOUNTREAD, (caddr_t) &segcount); if (error) - count = 0; + count = segcount = 0; + count *= segcount; if (count && (count < maxiocount)) maxiocount = count; error = do_ioctl(p1, p2, DKIOCGETMAXSEGMENTBYTECOUNTWRITE, (caddr_t) &count); + if (!error) + error = do_ioctl(p1, p2, DKIOCGETMAXSEGMENTCOUNTWRITE, (caddr_t) &segcount); if (error) - count = 0; + count = segcount = 0; + count *= segcount; if (count && (count < maxiocount)) maxiocount = count; diff --git a/bsd/kern/trace.codes b/bsd/kern/trace.codes index af5a27b19..7bfcacf18 100644 --- a/bsd/kern/trace.codes +++ b/bsd/kern/trace.codes @@ -217,6 +217,7 @@ 0x1400058 MACH_SCHED_REDISPATCH 0x140005C MACH_SCHED_REMOTE_AST 0x1400060 MACH_SCHED_LPA_BROKEN +0x1400064 MACH_DEEP_IDLE 0x1500000 MACH_MSGID_INVALID 0x1600000 MTX_SLEEP 0x1600004 MTX_SLEEP_DEADLINE @@ -257,6 +258,14 @@ 0x1700020 PMAP_flush_TLBS 0x1700024 PMAP_update_interrupt 0x1700028 PMAP_attribute_clear +0x1900000 MP_TLB_FLUSH +0x1900004 MP_CPUS_CALL +0x1900008 MP_CPUS_CALL_LOCAL +0x190000c MP_CPUS_CALL_ACTION +0x1900010 MP_CPUS_CALL_NOBUF +0x1900014 MP_CPU_FAST_START 
+0x1900018 MP_CPU_START +0x190001c MP_CPU_DEACTIVATE 0x2010000 L_IP_In_Beg 0x2010004 L_IP_Out_Beg 0x2010008 L_IP_In_End @@ -1240,6 +1249,38 @@ 0x53101a4 CPUPM_TEST_RUN_INFO 0x53101a8 CPUPM_TEST_SLAVE_INFO 0x53101ac CPUPM_FORCED_IDLE +0x53101b4 CPUPM_PSTATE_CHOOSE +0x53101b8 CPUPM_PSTATE_COMMIT +0x53101bc CPUPM_PSTATE_CHECK +0x531023C CPUPM_TQM +0x5310240 CPUPM_QUIESCE +0x5310244 CPUPM_MBD +0x5310248 CPUPM_PST_RATELIMIT_QOS +0x531024C CPUPM_PST_QOS_RATEUNLIMIT +0x5310250 CPUPM_PST_QOS_SWITCH +0x5310254 CPUPM_FORCED_IDLE +0x531023C CPUPM_TQM +0x5310240 CPUPM_QUIESCE +0x5310244 CPUPM_MBD +0x5310248 CPUPM_PST_RATELIMIT_QOS +0x531024C CPUPM_PST_QOS_RATEUNLIMIT +0x5310250 CPUPM_PST_QOS_SWITCH +0x5310254 CPUPM_FORCED_IDLE +0x5320000 CPUPM_PST_RESOLVE +0x5320004 CPUPM_PST_LOAD_TXFR +0x5320008 CPUPM_PST_IDLE_EXIT +0x532000C CPUPM_PST_IDLE_ENTRY +0x5320010 CPUPM_PST_TIMER +0x5320014 CPUPM_PST_MAXBUS +0x5320018 CPUPM_PST_MAXINT +0x532001C CPUPM_PST_PLIMIT +0x5320020 CPUPM_PST_SELFSEL +0x5320024 CPUPM_PST_RATELIMIT +0x5320028 CPUPM_PST_RATEUNLIMIT +0x532002C CPUPM_DVFS_PAUSE +0x5320030 CPUPM_DVFS_RESUME +0x5320034 CPUPM_DVFS_ADVANCE +0x5320038 CPUPM_DVFS_TRANSIT 0x5330000 HIBERNATE 0x5330004 HIBERNATE_WRITE_IMAGE 0x5330008 HIBERNATE_MACHINE_INIT diff --git a/bsd/net/dlil.c b/bsd/net/dlil.c index 888926318..af80a4a40 100644 --- a/bsd/net/dlil.c +++ b/bsd/net/dlil.c @@ -5797,7 +5797,7 @@ ifnet_fc_thread_cont(int err) { #pragma unused(err) struct sfb_bin_fcentry *fce; - struct inp_fc_entry *infc; + struct inpcb *inp; for (;;) { lck_mtx_assert(&ifnet_fclist_lock, LCK_MTX_ASSERT_OWNED); @@ -5813,17 +5813,14 @@ ifnet_fc_thread_cont(int err) SLIST_NEXT(fce, fce_link) = NULL; lck_mtx_unlock(&ifnet_fclist_lock); - infc = inp_fc_getinp(fce->fce_flowhash); - if (infc == NULL) { + inp = inp_fc_getinp(fce->fce_flowhash, 0); + if (inp == NULL) { ifnet_fce_free(fce); lck_mtx_lock_spin(&ifnet_fclist_lock); continue; } - VERIFY(infc->infc_inp != NULL); + inp_fc_feedback(inp); - inp_fc_feedback(infc->infc_inp); - - inp_fc_entry_free(infc); ifnet_fce_free(fce); lck_mtx_lock_spin(&ifnet_fclist_lock); } diff --git a/bsd/netinet/igmp.c b/bsd/netinet/igmp.c index 7e0cd82e9..e8442d135 100644 --- a/bsd/netinet/igmp.c +++ b/bsd/netinet/igmp.c @@ -1604,7 +1604,7 @@ igmp_input(struct mbuf *m, int off) OIGMPSTAT_INC(igps_rcv_tooshort); return; } - VERIFY(IS_P2ALIGNED(igmp, sizeof (u_int32_t))); + /* N.B.: we assume the packet was correctly aligned in ip_input. */ /* * Validate checksum. @@ -1701,8 +1701,10 @@ igmp_input(struct mbuf *m, int off) OIGMPSTAT_INC(igps_rcv_tooshort); return; } - VERIFY(IS_P2ALIGNED(igmpv3, - sizeof (u_int32_t))); + /* + * N.B.: we assume the packet was correctly + * aligned in ip_input. 
+ */ if (igmp_input_v3_query(ifp, ip, igmpv3) != 0) { m_freem(m); return; diff --git a/bsd/netinet/in_pcb.c b/bsd/netinet/in_pcb.c index 043057a59..485e8dbcd 100644 --- a/bsd/netinet/in_pcb.c +++ b/bsd/netinet/in_pcb.c @@ -201,22 +201,22 @@ struct inp_flowhash_key { u_int32_t inp_hash_seed = 0; -static __inline int infc_cmp(const struct inp_fc_entry *, - const struct inp_fc_entry *); +static __inline int infc_cmp(const struct inpcb *, + const struct inpcb *); lck_grp_t *inp_lck_grp; lck_grp_attr_t *inp_lck_grp_attr; lck_attr_t *inp_lck_attr; decl_lck_mtx_data(, inp_fc_lck); -RB_HEAD(inp_fc_tree, inp_fc_entry) inp_fc_tree; -RB_PROTOTYPE(inp_fc_tree, inp_fc_entry, infc_link, infc_cmp); +RB_HEAD(inp_fc_tree, inpcb) inp_fc_tree; +RB_PROTOTYPE(inp_fc_tree, inpcb, infc_link, infc_cmp); +RB_GENERATE(inp_fc_tree, inpcb, infc_link, infc_cmp); -RB_GENERATE(inp_fc_tree, inp_fc_entry, infc_link, infc_cmp); - -static unsigned int inp_fcezone_size; -static struct zone *inp_fcezone; -#define INP_FCEZONE_NAME "inp_fcezone" -#define INP_FCEZONE_MAX 32 +/* + * Use this inp as a key to find an inp in the flowhash tree. + * Accesses to it are protected by inp_fc_lck. + */ +struct inpcb key_inp; /* * in_pcb.c: manage the Protocol Control Blocks. @@ -235,19 +235,10 @@ socket_flowadv_init(void) inp_lck_attr = lck_attr_alloc_init(); lck_mtx_init(&inp_fc_lck, inp_lck_grp, inp_lck_attr); + lck_mtx_lock(&inp_fc_lck); RB_INIT(&inp_fc_tree); - - inp_fcezone_size = P2ROUNDUP(sizeof (struct inp_fc_entry), - sizeof (u_int64_t)); - inp_fcezone = zinit(inp_fcezone_size, - INP_FCEZONE_MAX * inp_fcezone_size, 0, INP_FCEZONE_NAME); - if (inp_fcezone == NULL) { - panic("%s: failed allocating %s", __func__, - INP_FCEZONE_NAME); - /* NOTREACHED */ - } - zone_change(inp_fcezone, Z_EXPAND, TRUE); - zone_change(inp_fcezone, Z_CALLERACCT, FALSE); + bzero(&key_inp, sizeof(key_inp)); + lck_mtx_unlock(&inp_fc_lck); } /* @@ -1721,7 +1712,6 @@ in_pcbrehash(struct inpcb *inp) void in_pcbremlists(struct inpcb *inp) { - struct inp_fc_entry *infce; inp->inp_gencnt = ++inp->inp_pcbinfo->ipi_gencnt; if (inp->inp_lport) { @@ -1736,10 +1726,11 @@ in_pcbremlists(struct inpcb *inp) } LIST_REMOVE(inp, inp_list); - infce = inp_fc_getinp(inp->inp_flowhash); - if (infce != NULL) - inp_fc_entry_free(infce); - + if (inp->inp_flags2 & INP2_IN_FCTREE) { + inp_fc_getinp(inp->inp_flowhash, + (INPFC_SOLOCKED|INPFC_REMOVE)); + VERIFY(!(inp->inp_flags2 & INP2_IN_FCTREE)); + } inp->inp_pcbinfo->ipi_count--; } @@ -2031,6 +2022,7 @@ inp_calc_flowhash(struct inpcb *inp) { struct inp_flowhash_key fh __attribute__((aligned(8))); u_int32_t flowhash = 0; + struct inpcb *tmp_inp = NULL; if (inp_hash_seed == 0) inp_hash_seed = RandomULong(); @@ -2055,102 +2047,69 @@ try_again: goto try_again; } - return flowhash; -} + inp->inp_flowhash = flowhash; -/* - * Function to compare inp_fc_entries in inp flow control tree - */ -static inline int -infc_cmp(const struct inp_fc_entry *fc1, const struct inp_fc_entry *fc2) -{ - return (fc1->infc_flowhash - fc2->infc_flowhash); -} - -int -inp_fc_addinp(struct inpcb *inp) -{ - struct inp_fc_entry keyfc, *infc; - u_int32_t flowhash = inp->inp_flowhash; - - keyfc.infc_flowhash = flowhash; - - lck_mtx_lock_spin(&inp_fc_lck); - infc = RB_FIND(inp_fc_tree, &inp_fc_tree, &keyfc); - if (infc != NULL && infc->infc_inp == inp) { - /* Entry is already in inp_fc_tree, return */ - lck_mtx_unlock(&inp_fc_lck); - return (1); - } + /* Insert the inp into inp_fc_tree */ - if (infc != NULL) { + lck_mtx_lock(&inp_fc_lck); + tmp_inp = 
RB_FIND(inp_fc_tree, &inp_fc_tree, inp); + if (tmp_inp != NULL) { /* - * There is a different fc entry with the same - * flow hash but different inp pointer. There - * can be a collision on flow hash but the - * probability is low. Let's just avoid - * adding a second one when there is a collision + * There is a different inp with the same flowhash. + * There can be a collision on flow hash but the + * probability is low. Let's recompute the + * flowhash. */ lck_mtx_unlock(&inp_fc_lck); - return (0); - } - - /* become regular mutex */ - lck_mtx_convert_spin(&inp_fc_lck); - - infc = zalloc_noblock(inp_fcezone); - if (infc == NULL) { - /* memory allocation failed */ - lck_mtx_unlock(&inp_fc_lck); - return (0); + /* recompute hash seed */ + inp_hash_seed = RandomULong(); + goto try_again; } - bzero(infc, sizeof (*infc)); - - infc->infc_flowhash = flowhash; - infc->infc_inp = inp; - - RB_INSERT(inp_fc_tree, &inp_fc_tree, infc); + RB_INSERT(inp_fc_tree, &inp_fc_tree, inp); + inp->inp_flags2 |= INP2_IN_FCTREE; lck_mtx_unlock(&inp_fc_lck); - return (1); + + return flowhash; } -struct inp_fc_entry* -inp_fc_getinp(u_int32_t flowhash) +/* + * Function to compare inp_fc_entries in inp flow control tree + */ +static inline int +infc_cmp(const struct inpcb *inp1, const struct inpcb *inp2) { - struct inp_fc_entry keyfc, *infc; + return (memcmp(&(inp1->inp_flowhash), &(inp2->inp_flowhash), + sizeof(inp1->inp_flowhash))); +} - keyfc.infc_flowhash = flowhash; +struct inpcb * +inp_fc_getinp(u_int32_t flowhash, u_int32_t flags) +{ + struct inpcb *inp = NULL; + int locked = (flags & INPFC_SOLOCKED) ? 1 : 0; lck_mtx_lock_spin(&inp_fc_lck); - infc = RB_FIND(inp_fc_tree, &inp_fc_tree, &keyfc); - if (infc == NULL) { + key_inp.inp_flowhash = flowhash; + inp = RB_FIND(inp_fc_tree, &inp_fc_tree, &key_inp); + if (inp == NULL) { /* inp is not present, return */ lck_mtx_unlock(&inp_fc_lck); return (NULL); } - RB_REMOVE(inp_fc_tree, &inp_fc_tree, infc); - - if (in_pcb_checkstate(infc->infc_inp, WNT_ACQUIRE, 0) == - WNT_STOPUSING) { - /* become regular mutex */ - lck_mtx_convert_spin(&inp_fc_lck); + if (flags & INPFC_REMOVE) { + RB_REMOVE(inp_fc_tree, &inp_fc_tree, inp); + lck_mtx_unlock(&inp_fc_lck); - /* - * This inp is going away, just don't process it. 
- */ - inp_fc_entry_free(infc); - infc = NULL; + bzero(&(inp->infc_link), sizeof (inp->infc_link)); + inp->inp_flags2 &= ~INP2_IN_FCTREE; + return (NULL); } + if (in_pcb_checkstate(inp, WNT_ACQUIRE, locked) == WNT_STOPUSING) + inp = NULL; lck_mtx_unlock(&inp_fc_lck); - return (infc); -} - -void -inp_fc_entry_free(struct inp_fc_entry *infc) -{ - zfree(inp_fcezone, infc); + return (inp); } void @@ -2209,6 +2168,7 @@ inp_reset_fc_state(struct inpcb *inp) int inp_set_fc_state(struct inpcb *inp, int advcode) { + struct inpcb *tmp_inp = NULL; /* * If there was a feedback from the interface when * send operation was in progress, we should ignore @@ -2220,7 +2180,12 @@ inp_set_fc_state(struct inpcb *inp, int advcode) return(0); inp->inp_flags &= ~(INP_FLOW_CONTROLLED | INP_FLOW_SUSPENDED); - if (inp_fc_addinp(inp)) { + if ((tmp_inp = inp_fc_getinp(inp->inp_flowhash, INPFC_SOLOCKED)) + != NULL) { + if (in_pcb_checkstate(tmp_inp, WNT_RELEASE, 1) + == WNT_STOPUSING) + return (0); + VERIFY(tmp_inp == inp); switch (advcode) { case FADV_FLOW_CONTROLLED: inp->inp_flags |= INP_FLOW_CONTROLLED; @@ -2234,8 +2199,9 @@ inp_set_fc_state(struct inpcb *inp, int advcode) inp->inp_socket->so_flags |= SOF_SUSPENDED; break; } + return (1); } - return(1); + return(0); } /* diff --git a/bsd/netinet/in_pcb.h b/bsd/netinet/in_pcb.h index 63dddb8fd..acd942f63 100644 --- a/bsd/netinet/in_pcb.h +++ b/bsd/netinet/in_pcb.h @@ -124,15 +124,6 @@ struct label; #endif struct ifnet; -#ifdef BSD_KERNEL_PRIVATE -/* Flow control entry per socket */ -struct inp_fc_entry { - RB_ENTRY(inp_fc_entry) infc_link; - u_int32_t infc_flowhash; - struct inpcb *infc_inp; -}; -#endif /* BSD_KERNEL_PRIVATE */ - struct inp_stat { u_int64_t rxpackets; u_int64_t rxbytes; @@ -153,9 +144,11 @@ struct inpcb { struct socket *inp_socket; /* back pointer to socket */ u_int32_t nat_cookie; /* Cookie stored and returned to NAT */ LIST_ENTRY(inpcb) inp_portlist; /* list for this PCB's local port */ + RB_ENTRY(inpcb) infc_link; /* link for flowhash RB tree */ struct inpcbport *inp_phd; /* head of this list */ inp_gen_t inp_gencnt; /* generation count of this instance */ u_int32_t inp_flags; /* generic IP/datagram flags */ + u_int32_t inp_flags2; /* generic IP/datagram flags #2 */ u_int32_t inp_flow; u_char inp_sndinprog_cnt; /* outstanding send operations */ @@ -611,6 +604,9 @@ struct inpcbinfo { /* XXX documentation, prefixes */ #define IN6P_RECV_ANYIF INP_RECV_ANYIF #define IN6P_CONTROLOPTS INP_CONTROLOPTS #define IN6P_NO_IFT_CELLULAR INP_NO_IFT_CELLULAR + +/* Overflowed INP flags; use INP2 prefix to avoid misuse */ +#define INP2_IN_FCTREE 0x2 /* in inp_fc_tree */ /* * socket AF version is {newer than,or include} * actual datagram AF version @@ -702,9 +698,11 @@ extern int inp_bindif(struct inpcb *, unsigned int); extern int inp_nocellular(struct inpcb *, unsigned int); extern u_int32_t inp_calc_flowhash(struct inpcb *); extern void socket_flowadv_init(void); -extern int inp_fc_addinp(struct inpcb *); -extern struct inp_fc_entry *inp_fc_getinp(u_int32_t); -extern void inp_fc_entry_free(struct inp_fc_entry *); + +/* Flags used by inp_fc_getinp */ +#define INPFC_SOLOCKED 0x1 +#define INPFC_REMOVE 0x2 +extern struct inpcb *inp_fc_getinp(u_int32_t, u_int32_t); extern void inp_fc_feedback(struct inpcb *); extern void inp_reset_fc_state(struct inpcb *); extern int inp_set_fc_state(struct inpcb *, int advcode); diff --git a/bsd/sys/cprotect.h b/bsd/sys/cprotect.h index 0dda075ac..eb0a134fd 100644 --- a/bsd/sys/cprotect.h +++ b/bsd/sys/cprotect.h @@ -65,6 +65,9 @@ 
extern "C" { #define CP_READ_ACCESS 0x1 #define CP_WRITE_ACCESS 0x2 +/* + * Check for this version when deciding to enable features + */ #define CONTENT_PROTECTION_XATTR_NAME "com.apple.system.cprotect" #define CP_NEW_MAJOR_VERS 4 #define CP_PREV_MAJOR_VERS 2 diff --git a/bsd/sys/kdebug.h b/bsd/sys/kdebug.h index ae8970aa4..9a76fbc80 100644 --- a/bsd/sys/kdebug.h +++ b/bsd/sys/kdebug.h @@ -150,6 +150,7 @@ __BEGIN_DECLS #define MACH_REMOTE_AST 0x17 /* AST signal issued to remote processor */ #define MACH_SCHED_LPA_BROKEN 0x18 /* last_processor affinity broken in choose_processor */ +#define MACH_DEEP_IDLE 0x19 /* deep idle on master processor */ /* Codes for pmap (DBG_MACH_PMAP) */ #define PMAP__CREATE 0x0 @@ -250,6 +251,7 @@ __BEGIN_DECLS #define DBG_DRVSD 19 /* Secure Digital */ #define DBG_DRVNAND 20 /* NAND drivers and layers */ #define DBG_SSD 21 /* SSD */ +#define DBG_DRVSPI 22 /* SPI */ /* Backwards compatibility */ #define DBG_DRVPOINTING DBG_DRVHID /* OBSOLETE: Use DBG_DRVHID instead */ @@ -539,7 +541,7 @@ extern void kdbg_trace_data(struct proc *proc, long *arg_pid); extern void kdbg_trace_string(struct proc *proc, long *arg1, long *arg2, long *arg3, long *arg4); extern void kdbg_dump_trace_to_file(const char *); -void start_kern_tracing(unsigned int); +void start_kern_tracing(unsigned int, boolean_t); struct task; extern void kdbg_get_task_name(char*, int, struct task *task); void disable_wrap(uint32_t *old_slowcheck, uint32_t *old_flags); diff --git a/config/MasterVersion b/config/MasterVersion index 80a99e0df..bf00b8e5c 100644 --- a/config/MasterVersion +++ b/config/MasterVersion @@ -1,4 +1,4 @@ -12.4.0 +12.5.0 # The first line of this file contains the master version number for the kernel. # All other instances of the kernel version in xnu are derived from this file. 
diff --git a/config/Private.x86_64.exports b/config/Private.x86_64.exports index 73963f2c2..62dbcc51c 100644 --- a/config/Private.x86_64.exports +++ b/config/Private.x86_64.exports @@ -5,6 +5,7 @@ _SHA256_Update __ZN22IOInterruptEventSource7warmCPUEy _acpi_install_wake_handler _acpi_sleep_kernel +_acpi_idle_kernel _add_fsevent _apic_table _apply_func_phys diff --git a/iokit/IOKit/IOHibernatePrivate.h b/iokit/IOKit/IOHibernatePrivate.h index da180376c..36d98707c 100644 --- a/iokit/IOKit/IOHibernatePrivate.h +++ b/iokit/IOKit/IOHibernatePrivate.h @@ -102,7 +102,11 @@ struct IOHibernateImageHeader uint32_t sleepTime; uint32_t compression; - uint32_t reserved[68]; // make sizeof == 512 + uint32_t reserved[62]; // make sizeof == 512 + + uint64_t restoreTime1 __attribute__ ((packed)); + uint64_t restoreTime2 __attribute__ ((packed)); + uint64_t restoreTime3 __attribute__ ((packed)); uint64_t encryptEnd __attribute__ ((packed)); uint64_t deviceBase __attribute__ ((packed)); diff --git a/iokit/IOKit/IOPolledInterface.h b/iokit/IOKit/IOPolledInterface.h index 09ef735d9..ec500eae2 100644 --- a/iokit/IOKit/IOPolledInterface.h +++ b/iokit/IOKit/IOPolledInterface.h @@ -32,6 +32,7 @@ #include #define kIOPolledInterfaceSupportKey "IOPolledInterface" +#define kIOPolledInterfaceActiveKey "IOPolledInterfaceActive" enum { diff --git a/iokit/IOKit/IOService.h b/iokit/IOKit/IOService.h index f3d1bed4a..94a230651 100644 --- a/iokit/IOKit/IOService.h +++ b/iokit/IOKit/IOService.h @@ -1785,7 +1785,7 @@ private: void ParentChangeNotifyInterestedDriversDidChange ( void ); void ParentChangeTellCapabilityDidChange ( void ); void ParentChangeAcknowledgePowerChange ( void ); - void ParentChangeCancelIdleTimer( IOPMPowerStateIndex ); + void ParentChangeRootChangeDown( void ); void all_done ( void ); void start_ack_timer ( void ); @@ -1793,9 +1793,10 @@ private: void startSettleTimer( void ); bool checkForDone ( void ); bool responseValid ( uint32_t x, int pid ); - void computeDesiredState ( unsigned long tempDesire = 0 ); + void computeDesiredState( unsigned long tempDesire, bool computeOnly ); void trackSystemSleepPreventers( IOPMPowerStateIndex, IOPMPowerStateIndex, IOPMPowerChangeFlags ); void tellSystemCapabilityChange( uint32_t nextMS ); + void restartIdleTimer( void ); static void ack_timer_expired( thread_call_param_t, thread_call_param_t ); static IOReturn actionAckTimerExpired(OSObject *, void *, void *, void *, void * ); diff --git a/iokit/IOKit/pwr_mgt/IOPM.h b/iokit/IOKit/pwr_mgt/IOPM.h index c2003ef47..f01c84178 100644 --- a/iokit/IOKit/pwr_mgt/IOPM.h +++ b/iokit/IOKit/pwr_mgt/IOPM.h @@ -90,6 +90,13 @@ enum { @constant kIOPMInitialDeviceState Indicates the initial power state for the device. If initialPowerStateForDomainState() returns a power state with this flag set in the capability field, then the initial power change is performed without calling the driver's setPowerState(). + + @constant kIOPMRootDomainState + An indication that the power flags represent the state of the root power + domain. This bit must not be set in the IOPMPowerState structure. + Power Management may pass this bit to initialPowerStateForDomainState() + or powerStateForDomainState() to map from a global system state to the + desired device state. 
*/ typedef unsigned long IOPMPowerFlags; enum { @@ -101,7 +108,8 @@ enum { kIOPMRestartCapability = 0x00000080, kIOPMSleep = 0x00000001, kIOPMRestart = 0x00000080, - kIOPMInitialDeviceState = 0x00000100 + kIOPMInitialDeviceState = 0x00000100, + kIOPMRootDomainState = 0x00000200 }; /* @@ -247,6 +255,30 @@ enum { */ #define kIOPMDestroyFVKeyOnStandbyKey "DestroyFVKeyOnStandby" +/******************************************************************************* + * + * Properties that can control power management behavior + * + ******************************************************************************/ + +/* kIOPMResetPowerStateOnWakeKey + * If an IOService publishes this key with the value of kOSBooleanTrue, + * then PM will disregard the influence from changePowerStateToPriv() or + * any activity tickles that occurred before system sleep when resolving + * the initial device power state on wake. Influences from power children + * and changePowerStateTo() are not eliminated. At the earliest opportunity + * upon system wake, PM will query the driver for a new power state to be + * installed as the initial changePowerStateToPriv() influence, by calling + * initialPowerStateForDomainState() with both kIOPMRootDomainState and + * kIOPMPowerOn flags set. The default implementation will always return + * the lowest power state. Drivers can override this default behavior to + * immediately raise the power state when there are work blocked on the + * power change, and cannot afford to wait until the next activity tickle. + * This property should be statically added to a driver's plist or set at + * runtime before calling PMinit(). + */ +#define kIOPMResetPowerStateOnWakeKey "IOPMResetPowerStateOnWake" + /******************************************************************************* * * Driver PM Assertions diff --git a/iokit/IOKit/pwr_mgt/IOPMPrivate.h b/iokit/IOKit/pwr_mgt/IOPMPrivate.h index 6ff802dde..55f86b475 100644 --- a/iokit/IOKit/pwr_mgt/IOPMPrivate.h +++ b/iokit/IOKit/pwr_mgt/IOPMPrivate.h @@ -660,6 +660,12 @@ enum { */ #define kIOPMUserWakeAlarmScheduledKey "UserWakeAlarmScheduled" +/* kIOPMDeepIdleSupportedKey + * Presence of this key indicates Deep Idle is supported on this platform. + * Key will always refer to a value of kOSBooleanTrue. 
+ */ +#define kIOPMDeepIdleSupportedKey "IOPMDeepIdleSupported" + /***************************************************************************** * * System Sleep Policy @@ -743,7 +749,8 @@ enum { kIOPMSleepTypeHibernate = 4, kIOPMSleepTypeStandby = 5, kIOPMSleepTypePowerOff = 6, - kIOPMSleepTypeLast = 7 + kIOPMSleepTypeDeepIdle = 7, + kIOPMSleepTypeLast = 8 }; // System Sleep Flags diff --git a/iokit/Kernel/IOHibernateIO.cpp b/iokit/Kernel/IOHibernateIO.cpp index 74b18022e..034ca65ca 100644 --- a/iokit/Kernel/IOHibernateIO.cpp +++ b/iokit/Kernel/IOHibernateIO.cpp @@ -774,6 +774,14 @@ IOPolledFileOpen( const char * filename, uint64_t setFileSize, if (kIOReturnSuccess != err) break; + vars->media = part; + next = part; + while (next) + { + next->setProperty(kIOPolledInterfaceActiveKey, kOSBooleanTrue); + next = next->getParentEntry(gIOServicePlane); + } + *fileVars = vars; *fileExtents = extentsData; @@ -1781,6 +1789,8 @@ IOHibernateSystemWake(void) static IOReturn IOHibernateDone(IOHibernateVars * vars) { + IORegistryEntry * next; + hibernate_teardown(vars->page_list, vars->page_list_wired, vars->page_list_pal); if (vars->videoMapping) @@ -1821,9 +1831,14 @@ IOHibernateDone(IOHibernateVars * vars) IOService::getPMRootDomain()->removeProperty(kIOHibernateGfxStatusKey); } - if (vars->fileVars) { + if ((next = vars->fileVars->media)) do + { + next->removeProperty(kIOPolledInterfaceActiveKey); + next = next->getParentEntry(gIOServicePlane); + } + while (next); IOPolledFileClose(vars->fileVars); } @@ -2716,6 +2731,12 @@ hibernate_machine_init(void) gIOHibernateCurrentHeader->diag[0], gIOHibernateCurrentHeader->diag[1], gIOHibernateCurrentHeader->diag[2], gIOHibernateCurrentHeader->diag[3]); + HIBLOG("restore times %qd, %qd, %qd ms, tsc 0x%qx scale 0x%x\n", + (((gIOHibernateCurrentHeader->restoreTime1 * pal_rtc_nanotime_info.scale) >> 32) / 1000000), + (((gIOHibernateCurrentHeader->restoreTime2 * pal_rtc_nanotime_info.scale) >> 32) / 1000000), + (((gIOHibernateCurrentHeader->restoreTime3 * pal_rtc_nanotime_info.scale) >> 32) / 1000000), + gIOHibernateCurrentHeader->restoreTime1, pal_rtc_nanotime_info.scale); + if ((kIOHibernateModeDiscardCleanActive | kIOHibernateModeDiscardCleanInactive) & gIOHibernateMode) hibernate_page_list_discard(vars->page_list); @@ -2756,8 +2777,19 @@ hibernate_machine_init(void) break; case kIOHibernateHandoffTypeMemoryMap: + + clock_get_uptime(&allTime); + hibernate_newruntime_map(data, handoff->bytecount, gIOHibernateCurrentHeader->systemTableOffset); + + clock_get_uptime(&endTime); + + SUB_ABSOLUTETIME(&endTime, &allTime); + absolutetime_to_nanoseconds(endTime, &nsec); + + HIBLOG("hibernate_newruntime_map time: %qd ms, ", nsec / 1000000ULL); + break; case kIOHibernateHandoffTypeDeviceTree: diff --git a/iokit/Kernel/IOHibernateInternal.h b/iokit/Kernel/IOHibernateInternal.h index 6e044eb57..23f875d18 100644 --- a/iokit/Kernel/IOHibernateInternal.h +++ b/iokit/Kernel/IOHibernateInternal.h @@ -64,6 +64,7 @@ typedef struct IOHibernateVars IOHibernateVars; struct IOPolledFileIOVars { struct kern_direct_file_io_ref_t * fileRef; + IORegistryEntry * media; class OSArray * pollers; IOByteCount blockSize; uint8_t * buffer; diff --git a/iokit/Kernel/IOHibernateRestoreKernel.c b/iokit/Kernel/IOHibernateRestoreKernel.c index 10bd705f5..b45b2acd3 100644 --- a/iokit/Kernel/IOHibernateRestoreKernel.c +++ b/iokit/Kernel/IOHibernateRestoreKernel.c @@ -37,9 +37,7 @@ #include #include "IOHibernateInternal.h" -#if defined(__i386__) || defined(__x86_64__) -#include -#endif +#include /* 
This code is linked into the kernel but part of the "__HIB" section, which means @@ -76,6 +74,27 @@ extern void acpi_wake_prot_entry(void); #if defined(__i386__) || defined(__x86_64__) +#define rdtsc(lo,hi) \ + __asm__ volatile("lfence; rdtsc; lfence" : "=a" (lo), "=d" (hi)) + +static inline uint64_t rdtsc64(void) +{ + uint64_t lo, hi; + rdtsc(lo, hi); + return ((hi) << 32) | (lo); +} + +#else + +static inline uint64_t rdtsc64(void) +{ + return (0); +} + +#endif /* defined(__i386__) || defined(__x86_64__) */ + +#if defined(__i386__) || defined(__x86_64__) + #define DBGLOG 1 #include @@ -441,6 +460,9 @@ hibernate_kernel_entrypoint(uint32_t p1, uint32_t handoffPages; uint32_t handoffPageCount; + uint64_t timeStart, time; + timeStart = rdtsc64(); + C_ASSERT(sizeof(IOHibernateImageHeader) == 512); headerPhys = ptoa_64(p1); @@ -604,8 +626,10 @@ hibernate_kernel_entrypoint(uint32_t p1, if (!conflicts) { // if (compressedSize) + time = rdtsc64(); pageSum = store_one_page(gIOHibernateCurrentHeader->processorFlags, src, compressedSize, 0, ppnum); + gIOHibernateCurrentHeader->restoreTime2 += (rdtsc64() - time); if (stage != 2) sum += pageSum; uncompressedPages++; @@ -658,6 +682,8 @@ hibernate_kernel_entrypoint(uint32_t p1, // -- copy back conflicts + time = rdtsc64(); + pageListPage = copyPageListHeadPage; while (pageListPage) { @@ -681,6 +707,8 @@ hibernate_kernel_entrypoint(uint32_t p1, pal_hib_patchup(); + gIOHibernateCurrentHeader->restoreTime3 = (rdtsc64() - time); + // -- image has been destroyed... gIOHibernateCurrentHeader->actualImage1Sum = sum; @@ -690,6 +718,8 @@ hibernate_kernel_entrypoint(uint32_t p1, gIOHibernateState = kIOHibernateStateWakingFromHibernate; + gIOHibernateCurrentHeader->restoreTime1 = (rdtsc64() - timeStart); + #if CONFIG_SLEEP #if defined(__i386__) || defined(__x86_64__) typedef void (*ResetProc)(void); diff --git a/iokit/Kernel/IOMemoryDescriptor.cpp b/iokit/Kernel/IOMemoryDescriptor.cpp index aa24637cb..8d49aeebe 100644 --- a/iokit/Kernel/IOMemoryDescriptor.cpp +++ b/iokit/Kernel/IOMemoryDescriptor.cpp @@ -906,14 +906,21 @@ void IOGeneralMemoryDescriptor::free() reserved->dp.memory = 0; UNLOCK; } - - if ((kIOMemoryTypePhysical != type) && (kIOMemoryTypePhysical64 != type)) + if ((kIOMemoryTypePhysical == type) || (kIOMemoryTypePhysical64 == type)) + { + ioGMDData * dataP; + if (_memoryEntries && (dataP = getDataP(_memoryEntries)) && dataP->fMappedBase) + { + dataP->fMapper->iovmFree(atop_64(dataP->fMappedBase), _pages); + dataP->fMappedBase = 0; + } + } + else { - while (_wireCount) - complete(); + while (_wireCount) complete(); } - if (_memoryEntries) - _memoryEntries->release(); + + if (_memoryEntries) _memoryEntries->release(); if (_ranges.v && !(kIOMemoryAsReference & _flags)) { diff --git a/iokit/Kernel/IOPMrootDomain.cpp b/iokit/Kernel/IOPMrootDomain.cpp index b3d6e1f44..742b1df58 100644 --- a/iokit/Kernel/IOPMrootDomain.cpp +++ b/iokit/Kernel/IOPMrootDomain.cpp @@ -2535,12 +2535,6 @@ bool IOPMrootDomain::tellChangeDown( unsigned long stateNum ) IOService::updateConsoleUsers(NULL, kIOMessageSystemWillSleep); - // Notify platform that sleep has begun - getPlatform()->callPlatformFunction( - sleepMessagePEFunction, false, - (void *)(uintptr_t) kIOMessageSystemWillSleep, - NULL, NULL, NULL); - // Two change downs are sent by IOServicePM. Ignore the 2nd. // But tellClientsWithResponse() must be called for both. 
ignoreTellChangeDown = true; @@ -2746,6 +2740,13 @@ IOReturn IOPMrootDomain::sysPowerDownHandler( DLOG("sysPowerDownHandler timeout %d s\n", (int) (params->maxWaitForReply / 1000 / 1000)); #endif + // Notify platform that sleep has begun, after the early + // sleep policy evaluation. + getPlatform()->callPlatformFunction( + sleepMessagePEFunction, false, + (void *)(uintptr_t) kIOMessageSystemWillSleep, + NULL, NULL, NULL); + if ( !OSCompareAndSwap( 0, 1, &gSleepOrShutdownPending ) ) { // Purposely delay the ack and hope that shutdown occurs quickly. @@ -3683,6 +3684,32 @@ struct IOPMSystemSleepPolicyTable IOPMSystemSleepPolicyEntry entries[]; } __attribute__((packed)); +enum { + kIOPMSleepAttributeHibernateSetup = 0x00000001, + kIOPMSleepAttributeHibernateSleep = 0x00000002 +}; + +static uint32_t +getSleepTypeAttributes( uint32_t sleepType ) +{ + static const uint32_t sleepTypeAttributes[ kIOPMSleepTypeLast ] = + { + /* invalid */ 0, + /* abort */ 0, + /* normal */ 0, + /* safesleep */ kIOPMSleepAttributeHibernateSetup, + /* hibernate */ kIOPMSleepAttributeHibernateSetup | kIOPMSleepAttributeHibernateSleep, + /* standby */ kIOPMSleepAttributeHibernateSetup | kIOPMSleepAttributeHibernateSleep, + /* poweroff */ kIOPMSleepAttributeHibernateSetup | kIOPMSleepAttributeHibernateSleep, + /* deepidle */ 0 + }; + + if (sleepType >= kIOPMSleepTypeLast) + return 0; + + return sleepTypeAttributes[sleepType]; +} + bool IOPMrootDomain::evaluateSystemSleepPolicy( IOPMSystemSleepParameters * params, int sleepPhase, uint32_t * hibMode ) { @@ -3829,7 +3856,8 @@ bool IOPMrootDomain::evaluateSystemSleepPolicy( goto done; } - if ((params->sleepType >= kIOPMSleepTypeSafeSleep) && + if ((getSleepTypeAttributes(params->sleepType) & + kIOPMSleepAttributeHibernateSetup) && ((*hibMode & kIOHibernateModeOn) == 0)) { *hibMode |= (kIOHibernateModeOn | kIOHibernateModeSleep); @@ -3951,9 +3979,10 @@ void IOPMrootDomain::evaluateSystemSleepPolicyEarly( void ) &hibernateMode)) { if (!hibernateNoDefeat && - (gEarlySystemSleepParams.sleepType == kIOPMSleepTypeNormalSleep)) + ((getSleepTypeAttributes(gEarlySystemSleepParams.sleepType) & + kIOPMSleepAttributeHibernateSetup) == 0)) { - // Disable hibernate setup for normal sleep + // skip hibernate setup hibernateDisabled = true; } } @@ -3991,7 +4020,8 @@ void IOPMrootDomain::evaluateSystemSleepPolicyFinal( void ) if (evaluateSystemSleepPolicy(¶ms, kIOPMSleepPhase2, &hibernateMode)) { if ((hibernateDisabled || hibernateAborted) && - (params.sleepType != kIOPMSleepTypeNormalSleep)) + (getSleepTypeAttributes(params.sleepType) & + kIOPMSleepAttributeHibernateSetup)) { // Final evaluation picked a state requiring hibernation, // but hibernate setup was skipped. 
Retry using the early @@ -4016,9 +4046,10 @@ void IOPMrootDomain::evaluateSystemSleepPolicyFinal( void ) paramsData->release(); } - if (params.sleepType >= kIOPMSleepTypeHibernate) + if (getSleepTypeAttributes(params.sleepType) & + kIOPMSleepAttributeHibernateSleep) { - // Disable safe sleep to force the hibernate path + // Disable sleep to force hibernation gIOHibernateMode &= ~kIOHibernateModeSleep; } } @@ -4410,8 +4441,7 @@ void IOPMrootDomain::overrideOurPowerChange( uint32_t changeFlags = *inOutChangeFlags; uint32_t currentPowerState = (uint32_t) getPowerState(); - if ((currentPowerState == powerState) || - (changeFlags & kIOPMParentInitiated)) + if (changeFlags & kIOPMParentInitiated) { // FIXME: cancel any parent change (unexpected) // Root parent is permanently pegged at max power, @@ -4453,6 +4483,20 @@ void IOPMrootDomain::overrideOurPowerChange( // Revert device desire from SLEEP->ON. changePowerStateToPriv(ON_STATE); } + else + { + // Broadcast power down + *inOutChangeFlags |= kIOPMRootChangeDown; + } + } + else if (powerState > currentPowerState) + { + if ((_currentCapability & kIOPMSystemCapabilityCPU) == 0) + { + // Broadcast power up when waking from sleep, but not for the + // initial power change at boot by checking for cpu capability. + *inOutChangeFlags |= kIOPMRootChangeUp; + } } } @@ -6079,7 +6123,7 @@ void IOPMrootDomain::evaluatePolicy( int stimulus, uint32_t arg ) if ( minutesToIdleSleep > minutesToDisplayDim ) minutesDelta = minutesToIdleSleep - minutesToDisplayDim; - else if( minutesToIdleSleep == minutesToDisplayDim ) + else if( minutesToIdleSleep <= minutesToDisplayDim ) minutesDelta = 1; if ((sleepSlider == 0) && (minutesToIdleSleep != 0)) diff --git a/iokit/Kernel/IOService.cpp b/iokit/Kernel/IOService.cpp index 0eabf83f9..f35c6b6d8 100644 --- a/iokit/Kernel/IOService.cpp +++ b/iokit/Kernel/IOService.cpp @@ -3108,9 +3108,11 @@ void IOService::doServiceMatch( IOOptionBits options ) if( matches) { lockForArbitration(); - if( 0 == (__state[0] & kIOServiceFirstPublishState)) + if( 0 == (__state[0] & kIOServiceFirstPublishState)) { + getMetaClass()->addInstance(this); deliverNotification( gIOFirstPublishNotification, kIOServiceFirstPublishState, 0xffffffff ); + } LOCKREADNOTIFY(); __state[1] &= ~kIOServiceNeedConfigState; __state[1] |= kIOServiceConfigState; @@ -3134,9 +3136,6 @@ void IOService::doServiceMatch( IOOptionBits options ) } UNLOCKNOTIFY(); - if (didRegister) { - getMetaClass()->addInstance(this); - } unlockForArbitration(); if (keepGuessing && matches->getCount() && (kIOReturnSuccess == getResources())) diff --git a/iokit/Kernel/IOServicePM.cpp b/iokit/Kernel/IOServicePM.cpp index 97711e1c8..4d35ed45b 100644 --- a/iokit/Kernel/IOServicePM.cpp +++ b/iokit/Kernel/IOServicePM.cpp @@ -83,17 +83,18 @@ OSDefineMetaClassAndStructors( PMEventDetails, OSObject ); // Globals //****************************************************************************** -static bool gIOPMInitialized = false; -static uint32_t gIOPMBusyCount = 0; -static uint32_t gIOPMWorkCount = 0; -static IOWorkLoop * gIOPMWorkLoop = 0; -static IOPMRequestQueue * gIOPMRequestQueue = 0; -static IOPMRequestQueue * gIOPMReplyQueue = 0; -static IOPMWorkQueue * gIOPMWorkQueue = 0; -static IOPMCompletionQueue * gIOPMFreeQueue = 0; -static IOPMRequest * gIOPMRequest = 0; -static IOService * gIOPMRootNode = 0; -static IOPlatformExpert * gPlatform = 0; +static bool gIOPMInitialized = false; +static uint32_t gIOPMBusyCount = 0; +static uint32_t gIOPMWorkCount = 0; +static uint32_t gIOPMTickleGeneration 
= 0; +static IOWorkLoop * gIOPMWorkLoop = 0; +static IOPMRequestQueue * gIOPMRequestQueue = 0; +static IOPMRequestQueue * gIOPMReplyQueue = 0; +static IOPMWorkQueue * gIOPMWorkQueue = 0; +static IOPMCompletionQueue * gIOPMFreeQueue = 0; +static IOPMRequest * gIOPMRequest = 0; +static IOService * gIOPMRootNode = 0; +static IOPlatformExpert * gPlatform = 0; static const OSSymbol * gIOPMPowerClientDevice = 0; static const OSSymbol * gIOPMPowerClientDriver = 0; @@ -555,6 +556,10 @@ void IOService::PMinit ( void ) gIOPMRootNode = this; fParentsKnowState = true; } + else if (getProperty(kIOPMResetPowerStateOnWakeKey) == kOSBooleanTrue) + { + fResetPowerStateOnWake = true; + } fAckTimer = thread_call_allocate( &IOService::ack_timer_expired, (thread_call_param_t)this); @@ -852,7 +857,9 @@ void IOService::handlePMstop ( IOPMRequest * request ) PM_UNLOCK(); } - // Tell idleTimerExpired() to ignore idle timer. + // Clear idle period to prevent idleTimerExpired() from servicing + // idle timer expirations. + fIdleTimerPeriod = 0; if (fIdleTimer && thread_call_cancel(fIdleTimer)) release(); @@ -1667,12 +1674,12 @@ IOReturn IOService::acknowledgeSetPowerState ( void ) void IOService::adjustPowerState ( uint32_t clamp ) { PM_ASSERT_IN_GATE(); - computeDesiredState(clamp); + computeDesiredState(clamp, false); if (fControllingDriver && fParentsKnowState && inPlane(gIOPowerPlane)) { IOPMPowerChangeFlags changeFlags = kIOPMSelfInitiated; - // Indicate that children desires were ignored, and do not ask + // Indicate that children desires must be ignored, and do not ask // apps for permission to drop power. This is used by root domain // for demand sleep. @@ -1793,7 +1800,7 @@ void IOService::handlePowerDomainWillChangeTo ( IOPMRequest * request ) OSIterator * iter; OSObject * next; IOPowerConnection * connection; - IOPMPowerStateIndex newPowerState; + IOPMPowerStateIndex maxPowerState; IOPMPowerFlags combinedPowerFlags; bool savedParentsKnowState; IOReturn result = IOPMAckImplied; @@ -1834,16 +1841,20 @@ void IOService::handlePowerDomainWillChangeTo ( IOPMRequest * request ) if ( fControllingDriver && !fInitialPowerChange ) { - newPowerState = fControllingDriver->maxCapabilityForDomainState( + maxPowerState = fControllingDriver->maxCapabilityForDomainState( combinedPowerFlags); - // Absorb parent's kIOPMSynchronize flag. + // Use kIOPMSynchronize below instead of kIOPMRootBroadcastFlags + // to avoid propagating the root change flags if any service must + // change power state due to root's will-change notification. + // Root does not change power state for kIOPMSynchronize. 
+ myChangeFlags = kIOPMParentInitiated | kIOPMDomainWillChange | (parentChangeFlags & kIOPMSynchronize); result = startPowerChange( /* flags */ myChangeFlags, - /* power state */ newPowerState, + /* power state */ maxPowerState, /* domain flags */ combinedPowerFlags, /* connection */ whichParent, /* parent flags */ parentPowerFlags); @@ -1909,8 +1920,10 @@ void IOService::handlePowerDomainDidChangeTo ( IOPMRequest * request ) IOPowerConnection * whichParent = (IOPowerConnection *) request->fArg1; IOPMPowerChangeFlags parentChangeFlags = (IOPMPowerChangeFlags)(uintptr_t) request->fArg2; IOPMPowerChangeFlags myChangeFlags; - IOPMPowerStateIndex newPowerState; - IOPMPowerStateIndex initialDesire; + IOPMPowerStateIndex maxPowerState; + IOPMPowerStateIndex initialDesire = 0; + bool computeDesire = false; + bool desireChanged = false; bool savedParentsKnowState; IOReturn result = IOPMAckImplied; @@ -1929,29 +1942,63 @@ void IOService::handlePowerDomainDidChangeTo ( IOPMRequest * request ) if ( fControllingDriver ) { - newPowerState = fControllingDriver->maxCapabilityForDomainState( + maxPowerState = fControllingDriver->maxCapabilityForDomainState( fParentsCurrentPowerFlags); if (fInitialPowerChange) { + computeDesire = true; initialDesire = fControllingDriver->initialPowerStateForDomainState( - fParentsCurrentPowerFlags); - computeDesiredState(initialDesire); + fParentsCurrentPowerFlags); } - else if (fAdvisoryTickleUsed && (newPowerState > 0) && - ((parentChangeFlags & kIOPMSynchronize) == 0)) + else if (parentChangeFlags & kIOPMRootChangeUp) { - // re-compute desired state in case advisory tickle was enabled - computeDesiredState(); + if (fAdvisoryTickleUsed) + { + // On system wake, re-compute the desired power state since + // gIOPMAdvisoryTickleEnabled will change for a full wake, + // which is an input to computeDesiredState(). This is not + // necessary for a dark wake because powerChangeDone() will + // handle the dark to full wake case, but it does no harm. + + desireChanged = true; + } + + if (fResetPowerStateOnWake) + { + // Query the driver for the desired power state on system wake. + // Default implementation returns the lowest power state. + + IOPMPowerStateIndex wakePowerState = + fControllingDriver->initialPowerStateForDomainState( + kIOPMRootDomainState | kIOPMPowerOn ); + + // fDesiredPowerState was adjusted before going to sleep + // with fDeviceDesire at min. + + if (wakePowerState > fDesiredPowerState) + { + // Must schedule a power adjustment if we changed the + // device desire. That will update the desired domain + // power on the parent power connection and ping the + // power parent if necessary. + + updatePowerClient(gIOPMPowerClientDevice, wakePowerState); + desireChanged = true; + } + } } - // Absorb parent's kIOPMSynchronize flag. + if (computeDesire || desireChanged) + computeDesiredState(initialDesire, false); + + // Absorb and propagate parent's broadcast flags myChangeFlags = kIOPMParentInitiated | kIOPMDomainDidChange | - (parentChangeFlags & kIOPMSynchronize); + (parentChangeFlags & kIOPMRootBroadcastFlags); result = startPowerChange( /* flags */ myChangeFlags, - /* power state */ newPowerState, + /* power state */ maxPowerState, /* domain flags */ fParentsCurrentPowerFlags, /* connection */ whichParent, /* parent flags */ 0); @@ -1974,12 +2021,13 @@ void IOService::handlePowerDomainDidChangeTo ( IOPMRequest * request ) } // If the parent registers its power driver late, then this is the - // first opportunity to tell our parent about our desire. 
+ // first opportunity to tell our parent about our desire. Or if the + // child's desire changed during a parent change notify. - if (!savedParentsKnowState && fParentsKnowState) + if ((!savedParentsKnowState && fParentsKnowState) || desireChanged) { - PM_LOG1("%s::powerDomainDidChangeTo parentsKnowState = true\n", - getName()); + PM_LOG1("%s::powerDomainDidChangeTo parentsKnowState %d\n", + getName(), fParentsKnowState); requestDomainPower( fDesiredPowerState ); } @@ -2057,10 +2105,10 @@ void IOService::trackSystemSleepPreventers( { IOPMRequest * cancelRequest; - cancelRequest = acquirePMRequest( this, kIOPMRequestTypeIdleCancel ); + cancelRequest = acquirePMRequest( getPMRootDomain(), kIOPMRequestTypeIdleCancel ); if (cancelRequest) { - getPMRootDomain()->submitPMRequest( cancelRequest ); + submitPMRequest( cancelRequest ); } } #endif @@ -2538,7 +2586,7 @@ void IOService::handlePowerOverrideChanged ( IOPMRequest * request ) // [private] computeDesiredState //********************************************************************************* -void IOService::computeDesiredState ( unsigned long localClamp ) +void IOService::computeDesiredState( unsigned long localClamp, bool computeOnly ) { OSIterator * iter; OSObject * next; @@ -2603,6 +2651,7 @@ void IOService::computeDesiredState ( unsigned long localClamp ) if (hasChildren && (client == gIOPMPowerClientChildProxy)) continue; + // Advisory tickles are irrelevant unless system is in full wake if (client == gIOPMPowerClientAdvisoryTickle && !gIOPMAdvisoryTickleEnabled) continue; @@ -2640,37 +2689,30 @@ void IOService::computeDesiredState ( unsigned long localClamp ) (uint32_t) localClamp, (uint32_t) fTempClampPowerState, (uint32_t) fCurrentPowerState, newPowerState); - // Restart idle timer if stopped and device desire has increased. - // Or if advisory desire exists. - - if (fIdleTimerStopped) + if (!computeOnly) { - if (fDeviceDesire > 0) - { - fIdleTimerStopped = false; - fActivityTickleCount = 0; - clock_get_uptime(&fIdleTimerStartTime); - start_PM_idle_timer(); - } - else if (fHasAdvisoryDesire) + // Restart idle timer if possible when device desire has increased. + // Or if an advisory desire exists. + + if (fIdleTimerPeriod && fIdleTimerStopped) { - fIdleTimerStopped = false; - start_PM_idle_timer(); + restartIdleTimer(); } - } - // Invalidate cached tickle power state when desires change, and not - // due to a tickle request. This invalidation must occur before the - // power state change to minimize races. We want to err on the side - // of servicing more activity tickles rather than dropping one when - // the device is in a low power state. + // Invalidate cached tickle power state when desires change, and not + // due to a tickle request. In case the driver has requested a lower + // power state, but the tickle is caching a higher power state which + // will drop future tickles until the cached value is lowered or in- + // validated. The invalidation must occur before the power transition + // to avoid dropping a necessary tickle. 
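The cache being invalidated here is the same fActivityTicklePowerState consulted on the activityTickle() fast path shown further down; condensed, that consumer behaves roughly like this (simplified sketch):

    IOLockLock(fActivityLock);
    // Only a tickle that beats the cached value is turned into a PM
    // request, so a stale high cache entry would silently swallow the
    // tickles needed to raise power after the driver lowered its desire.
    if (fActivityTicklePowerState < (int) stateNumber)
    {
        fActivityTicklePowerState = stateNumber;
        // ... queue a kIOPMRequestTypeActivityTickle request ...
    }
    IOLockUnlock(fActivityLock);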
- if ((getPMRequestType() != kIOPMRequestTypeActivityTickle) && - (fActivityTicklePowerState != kInvalidTicklePowerState)) - { - IOLockLock(fActivityLock); - fActivityTicklePowerState = kInvalidTicklePowerState; - IOLockUnlock(fActivityLock); + if ((getPMRequestType() != kIOPMRequestTypeActivityTickle) && + (fActivityTicklePowerState != kInvalidTicklePowerState)) + { + IOLockLock(fActivityLock); + fActivityTicklePowerState = kInvalidTicklePowerState; + IOLockUnlock(fActivityLock); + } } } @@ -2795,6 +2837,7 @@ bool IOService::activityTickle ( unsigned long type, unsigned long stateNumber ) { IOPMRequest * request; bool noPowerChange = true; + uint32_t tickleFlags; if (!initialized) return true; // no power change @@ -2820,12 +2863,13 @@ bool IOService::activityTickle ( unsigned long type, unsigned long stateNumber ) fActivityTicklePowerState = stateNumber; noPowerChange = false; + tickleFlags = kTickleTypeActivity | kTickleTypePowerRise; request = acquirePMRequest( this, kIOPMRequestTypeActivityTickle ); if (request) { - request->fArg0 = (void *) stateNumber; // power state - request->fArg1 = (void *) true; // power rise - request->fArg2 = (void *) false; // regular tickle + request->fArg0 = (void *) stateNumber; + request->fArg1 = (void *) tickleFlags; + request->fArg2 = (void *) gIOPMTickleGeneration; submitPMRequest(request); } } @@ -2845,12 +2889,13 @@ bool IOService::activityTickle ( unsigned long type, unsigned long stateNumber ) fAdvisoryTicklePowerState = stateNumber; noPowerChange = false; + tickleFlags = kTickleTypeAdvisory | kTickleTypePowerRise; request = acquirePMRequest( this, kIOPMRequestTypeActivityTickle ); if (request) { - request->fArg0 = (void *) stateNumber; // power state - request->fArg1 = (void *) true; // power rise - request->fArg2 = (void *) true; // advisory tickle + request->fArg0 = (void *) stateNumber; + request->fArg1 = (void *) tickleFlags; + request->fArg2 = (void *) gIOPMTickleGeneration; submitPMRequest(request); } } @@ -2871,14 +2916,26 @@ bool IOService::activityTickle ( unsigned long type, unsigned long stateNumber ) void IOService::handleActivityTickle ( IOPMRequest * request ) { uint32_t ticklePowerState = (uint32_t)(uintptr_t) request->fArg0; - bool deviceWasActive = (request->fArg1 == (void *) true); - bool isRegularTickle = (request->fArg2 == (void *) false); + uint32_t tickleFlags = (uint32_t)(uintptr_t) request->fArg1; + uint32_t tickleGeneration = (uint32_t)(uintptr_t) request->fArg2; bool adjustPower = false; PM_ASSERT_IN_GATE(); - if (isRegularTickle) + if (fResetPowerStateOnWake && (tickleGeneration != gIOPMTickleGeneration)) + { + // Drivers that don't want power restored on wake will drop any + // tickles that pre-dates the current system wake. The model is + // that each wake is a fresh start, with power state depressed + // until a new tickle or an explicit power up request from the + // driver. It is possible for the PM work loop to enter the + // system sleep path with tickle requests queued. 
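A driver opts into this reset-on-wake behavior by publishing the kIOPMResetPowerStateOnWakeKey property before PMinit() samples it; a hypothetical example (the class, power state table, and count names are illustrative only):

    bool MyDriver::start( IOService * provider )
    {
        if (!super::start(provider))
            return false;

        // Must be set before PMinit(), which checks the property.
        setProperty(kIOPMResetPowerStateOnWakeKey, kOSBooleanTrue);

        PMinit();
        provider->joinPMtree(this);
        registerPowerDriver(this, myPowerStates, kMyNumPowerStates);
        return true;
    }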
+ + return; + } + + if (tickleFlags & kTickleTypeActivity) { - if (deviceWasActive) + if (tickleFlags & kTickleTypePowerRise) { if ((ticklePowerState > fDeviceDesire) && (ticklePowerState < fNumberOfPowerStates)) @@ -2904,7 +2961,7 @@ void IOService::handleActivityTickle ( IOPMRequest * request ) } else // advisory tickle { - if (deviceWasActive) + if (tickleFlags & kTickleTypePowerRise) { if ((ticklePowerState == fDeviceUsablePowerState) && (ticklePowerState < fNumberOfPowerStates)) @@ -3054,6 +3111,30 @@ void IOService::start_PM_idle_timer ( void ) if (pending) release(); } +//********************************************************************************* +// [private] restartIdleTimer +//********************************************************************************* + +void IOService::restartIdleTimer( void ) +{ + if (fDeviceDesire != 0) + { + fIdleTimerStopped = false; + fActivityTickleCount = 0; + clock_get_uptime(&fIdleTimerStartTime); + start_PM_idle_timer(); + } + else if (fHasAdvisoryDesire) + { + fIdleTimerStopped = false; + start_PM_idle_timer(); + } + else + { + fIdleTimerStopped = true; + } +} + //********************************************************************************* // idle_timer_expired //********************************************************************************* @@ -3085,8 +3166,10 @@ void IOService::idleTimerExpired( void ) { IOPMRequest * request; bool restartTimer = true; + uint32_t tickleFlags; - if ( !initialized || !fIdleTimerPeriod || fLockedFlags.PMStop ) + if ( !initialized || !fIdleTimerPeriod || fIdleTimerStopped || + fLockedFlags.PMStop ) return; IOLockLock(fActivityLock); @@ -3108,12 +3191,13 @@ void IOService::idleTimerExpired( void ) if (fActivityTicklePowerState > 0) fActivityTicklePowerState--; + tickleFlags = kTickleTypeActivity | kTickleTypePowerDrop; request = acquirePMRequest( this, kIOPMRequestTypeActivityTickle ); if (request) { - request->fArg0 = (void *) 0; // power state (irrelevant) - request->fArg1 = (void *) false; // timer expiration (not tickle) - request->fArg2 = (void *) false; // regular tickle + request->fArg0 = (void *) 0; // irrelevant + request->fArg1 = (void *) tickleFlags; + request->fArg2 = (void *) gIOPMTickleGeneration; submitPMRequest( request ); // Do not restart timer until after the tickle request has been @@ -3132,12 +3216,13 @@ void IOService::idleTimerExpired( void ) // Want new tickles to turn into pm request after we drop the lock fAdvisoryTicklePowerState = kInvalidTicklePowerState; + tickleFlags = kTickleTypeAdvisory | kTickleTypePowerDrop; request = acquirePMRequest( this, kIOPMRequestTypeActivityTickle ); if (request) { - request->fArg0 = (void *) 0; // power state (irrelevant) - request->fArg1 = (void *) false; // timer expiration (not tickle) - request->fArg2 = (void *) true; // advisory tickle + request->fArg0 = (void *) 0; // irrelevant + request->fArg1 = (void *) tickleFlags; + request->fArg2 = (void *) gIOPMTickleGeneration; submitPMRequest( request ); // Do not restart timer until after the tickle request has been @@ -4098,8 +4183,13 @@ void IOService::all_done ( void ) } else if (fAdvisoryTickleUsed) { - // Not root domain and advisory tickle target + // Not root domain and advisory tickle target. // Re-adjust power after power tree sync at the 'did' pass + // to recompute desire and adjust power state between dark + // and full wake transitions. Root domain is responsible + // for calling setAdvisoryTickleEnable() before starting + // the kIOPMSynchronize power change. 
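The root-domain half of that contract is the setAdvisoryTickleEnable() switch, which feeds the gIOPMAdvisoryTickleEnabled test in computeDesiredState(); the intended sequencing is roughly the following (illustrative, not taken from this patch):

    // Root domain, on promotion to full (user) wake: allow advisory
    // desires to count again, then start the kIOPMSynchronize change so
    // every advisory-tickle client recomputes its desired state.
    setAdvisoryTickleEnable( true );
    // ... begin the tree synchronization power change ...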
+ if (!fAdjustPowerScheduled && (fHeadNoteChangeFlags & kIOPMDomainDidChange)) { @@ -4150,6 +4240,12 @@ void IOService::all_done ( void ) if (fCurrentCapabilityFlags & kIOPMStaticPowerValid) fCurrentPowerConsumption = powerStatePtr->staticPower; + if (fHeadNoteChangeFlags & kIOPMRootChangeDown) + { + // Bump tickle generation count once the entire tree is down + gIOPMTickleGeneration++; + } + // inform subclass policy-maker if (fPCDFunctionOverride && fParentsKnowState && assertPMDriverCall(&callEntry, kIOPMADC_NoInactiveCheck)) @@ -4168,6 +4264,9 @@ void IOService::all_done ( void ) // parent's power change if ( fHeadNoteChangeFlags & kIOPMParentInitiated) { + if (fHeadNoteChangeFlags & kIOPMRootChangeDown) + ParentChangeRootChangeDown(); + if (((fHeadNoteChangeFlags & kIOPMDomainWillChange) && (fCurrentPowerState >= fHeadNotePowerState)) || ((fHeadNoteChangeFlags & kIOPMDomainDidChange) && @@ -4307,6 +4406,10 @@ void IOService::OurChangeStart ( void ) } } +//********************************************************************************* +// [private] requestDomainPowerApplier +// +// Call requestPowerDomainState() on all power parents. //********************************************************************************* struct IOPMRequestDomainPowerContext { @@ -4345,6 +4448,10 @@ requestDomainPowerApplier( //********************************************************************************* // [private] requestDomainPower +// +// Called by a power child to broadcast its desired power state to all parents. +// If the child self-initiates a power change, it must call this function to +// allow its parents to adjust power state. //********************************************************************************* IOReturn IOService::requestDomainPower( @@ -4362,7 +4469,7 @@ IOReturn IOService::requestDomainPower( if (IS_PM_ROOT) return kIOReturnSuccess; - // Fetch the input power flags for the requested power state. + // Fetch our input power flags for the requested power state. // Parent request is stated in terms of required power flags. requestPowerFlags = fPowerStates[ourPowerState].inputPowerFlags; @@ -4377,6 +4484,7 @@ IOReturn IOService::requestDomainPower( } fPreviousRequestPowerFlags = requestPowerFlags; + // The results will be collected by fHeadNoteDomainTargetFlags context.child = this; context.requestPowerFlags = requestPowerFlags; fHeadNoteDomainTargetFlags = 0; @@ -4387,7 +4495,7 @@ IOReturn IOService::requestDomainPower( maxPowerState = fControllingDriver->maxCapabilityForDomainState( fHeadNoteDomainTargetFlags ); - if (maxPowerState < fHeadNotePowerState) + if (maxPowerState < ourPowerState) { PM_LOG1("%s: power desired %u:0x%x got %u:0x%x\n", getName(), @@ -4600,16 +4708,20 @@ IOReturn IOService::ParentChangeStart ( void ) PM_ASSERT_IN_GATE(); OUR_PMLog( kPMLogStartParentChange, fHeadNotePowerState, fCurrentPowerState ); - // Power domain is lowering power - if ( fHeadNotePowerState < fCurrentPowerState ) + // Root power domain has transitioned to its max power state + if ((fHeadNoteChangeFlags & (kIOPMDomainDidChange | kIOPMRootChangeUp)) == + (kIOPMDomainDidChange | kIOPMRootChangeUp)) { - // Piggy-back idle timer cancellation on a parent down - if (0 == fHeadNotePowerState) - ParentChangeCancelIdleTimer(fHeadNotePowerState); - - // TODO: redundant? 
See handlePowerDomainWillChangeTo() - setParentInfo( fHeadNoteParentFlags, fHeadNoteParentConnection, true ); + // Restart the idle timer stopped by ParentChangeRootChangeDown() + if (fIdleTimerPeriod && fIdleTimerStopped) + { + restartIdleTimer(); + } + } + // Power domain is forcing us to lower power + if ( fHeadNotePowerState < fCurrentPowerState ) + { PM_ACTION_2(actionPowerChangeStart, fHeadNotePowerState, &fHeadNoteChangeFlags); // Tell apps and kernel clients @@ -4651,10 +4763,10 @@ IOReturn IOService::ParentChangeStart ( void ) ParentChangeTellCapabilityWillChange(); return IOPMWillAckLater; } - else if (fHeadNoteChangeFlags & kIOPMSynchronize) + else if (fHeadNoteChangeFlags & kIOPMRootBroadcastFlags) { - // We do not need to change power state, but notify - // children to propagate tree synchronization. + // No need to change power state, but broadcast change + // to our children. fMachineState = kIOPM_SyncNotifyDidChange; fDriverCallReason = kDriverCallInformPreChange; notifyChildren(); @@ -4666,6 +4778,103 @@ IOReturn IOService::ParentChangeStart ( void ) return IOPMAckImplied; } +//****************************************************************************** +// [private] ParentChangeRootChangeDown +// +// Root domain has finished the transition to the system sleep state. And all +// drivers in the power plane should have powered down. Cancel the idle timer, +// and also reset the device desire for those drivers that don't want power +// automatically restored on wake. +//****************************************************************************** + +void IOService::ParentChangeRootChangeDown( void ) +{ + // Always stop the idle timer before root power down + if (fIdleTimerPeriod && !fIdleTimerStopped) + { + fIdleTimerStopped = true; + if (fIdleTimer && thread_call_cancel(fIdleTimer)) + release(); + } + + if (fResetPowerStateOnWake) + { + // Reset device desire down to the lowest power state. + // Advisory tickle desire is intentionally untouched since + // it has no effect until system is promoted to full wake. + + if (fDeviceDesire != 0) + { + updatePowerClient(gIOPMPowerClientDevice, 0); + computeDesiredState(0, true); + PM_LOG1("%s: tickle desire removed\n", fName); + } + + // Invalidate tickle cache so the next tickle will issue a request + IOLockLock(fActivityLock); + fDeviceWasActive = false; + fActivityTicklePowerState = kInvalidTicklePowerState; + IOLockUnlock(fActivityLock); + + fIdleTimerMinPowerState = 0; + } + else if (fAdvisoryTickleUsed) + { + // Less aggressive mechanism to accelerate idle timer expiration + // before system sleep. May not always allow the driver to wake + // up from system sleep in the min power state. + + AbsoluteTime now; + uint64_t nsec; + bool dropTickleDesire = false; + + if (fIdleTimerPeriod && !fIdleTimerIgnored && + (fIdleTimerMinPowerState == 0) && + (fDeviceDesire != 0)) + { + IOLockLock(fActivityLock); + + if (!fDeviceWasActive) + { + // No tickles since the last idle timer expiration. + // Safe to drop the device desire to zero. + dropTickleDesire = true; + } + else + { + // Was tickled since the last idle timer expiration, + // but not in the last minute. 
+ clock_get_uptime(&now); + SUB_ABSOLUTETIME(&now, &fDeviceActiveTimestamp); + absolutetime_to_nanoseconds(now, &nsec); + if (nsec >= kNoTickleCancelWindow) + { + dropTickleDesire = true; + } + } + + if (dropTickleDesire) + { + // Force the next tickle to raise power state + fDeviceWasActive = false; + fActivityTicklePowerState = kInvalidTicklePowerState; + } + + IOLockUnlock(fActivityLock); + } + + if (dropTickleDesire) + { + // Advisory tickle desire is intentionally untouched since + // it has no effect until system is promoted to full wake. + + updatePowerClient(gIOPMPowerClientDevice, 0); + computeDesiredState(0, true); + PM_LOG1("%s: tickle desire dropped\n", fName); + } + } +} + //********************************************************************************* // [private] ParentChangeTellPriorityClientsPowerDown // @@ -4785,72 +4994,6 @@ void IOService::ParentChangeAcknowledgePowerChange ( void ) nub->release(); } -void IOService::ParentChangeCancelIdleTimer( IOPMPowerStateIndex newPowerState ) -{ - AbsoluteTime now; - uint64_t nsec; - bool cancel = false; - - // No ready or idle timer not in use - if (!initialized || !fIdleTimerPeriod || fLockedFlags.PMStop || - !fAdvisoryTickleUsed) - return; - - // Not allowed to induce artifical idle timeout - if (fIdleTimerIgnored || fIdleTimerMinPowerState) - goto done; - - // Idle timer already has no influence - if (!fDesiredPowerState || fIdleTimerStopped) - goto done; - - IOLockLock(fActivityLock); - - if (!fDeviceWasActive) - { - // No tickles since the last idle timer expiration. - // Safe to drop the device desire to zero. - cancel = true; - } - else - { - // Was tickled since the last idle timer expiration, - // but not in the last minute. - clock_get_uptime(&now); - SUB_ABSOLUTETIME(&now, &fDeviceActiveTimestamp); - absolutetime_to_nanoseconds(now, &nsec); - if (nsec >= kNoTickleCancelWindow) - { - cancel = true; - } - } - - if (cancel) - { - // Force the next tickle to raise power state - fActivityTicklePowerState = kInvalidTicklePowerState; - fDeviceWasActive = false; - } - - IOLockUnlock(fActivityLock); - - if (cancel) - { - // cancel idle timer - if (fIdleTimer && thread_call_cancel(fIdleTimer)) - release(); - - updatePowerClient(gIOPMPowerClientDevice, 0); - computeDesiredState(); - - fIdleTimerStopped = true; - } - -done: - OUR_PMLog( kPMLogStartParentChange, fHeadNotePowerState, fCurrentPowerState ); - PM_LOG("%s::%s cancel=%d\n", fName, __FUNCTION__, cancel); -} - // MARK: - // MARK: Ack and Settle timers @@ -4895,6 +5038,12 @@ settle_timer_expired( thread_call_param_t arg0, thread_call_param_t arg1 ) void IOService::startSettleTimer( void ) { +#if NOT_USEFUL + // This function is broken and serves no useful purpose since it never + // updates fSettleTimeUS to a non-zero value to stall the state machine, + // yet it starts a delay timer. It appears no driver relies on a delay + // from settleUpTime and settleDownTime in the power state table. 
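For reference, settleUpTime and settleDownTime are per-state fields of the public IOPMPowerState table handed to registerPowerDriver(); a minimal two-state table showing where they sit (values illustrative):

    static IOPMPowerState myPowerStates[2] =
    {
        // version, capabilityFlags, outputPowerCharacter, inputPowerRequirement,
        // staticPower, unbudgetedPower, powerToAttain, timeToAttain,
        // settleUpTime, timeToLower, settleDownTime, powerDomainBudget
        { kIOPMPowerStateVersion1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
        { kIOPMPowerStateVersion1, kIOPMDeviceUsable, kIOPMPowerOn, kIOPMPowerOn,
          0, 0, 0, /* timeToAttain */ 0, /* settleUpTime */ 0,
          /* timeToLower */ 0, /* settleDownTime */ 0, 0 }
    };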
+ AbsoluteTime deadline; IOPMPowerStateIndex i; uint32_t settleTime = 0; @@ -4931,6 +5080,7 @@ void IOService::startSettleTimer( void ) pending = thread_call_enter_delayed(fSettleTimer, deadline); if (pending) release(); } +#endif } //********************************************************************************* @@ -6337,6 +6487,12 @@ unsigned long IOService::initialPowerStateForDomainState ( IOPMPowerFlags domain { int i; + if (fResetPowerStateOnWake && (domainState & kIOPMRootDomainState)) + { + // Return lowest power state for any root power domain changes + return 0; + } + if (fNumberOfPowerStates == 0 ) { return 0; @@ -6606,26 +6762,10 @@ bool IOService::retirePMRequest( IOPMRequest * request, IOPMWorkQueue * queue ) // Catch requests created by idleTimerExpired(). if ((request->getType() == kIOPMRequestTypeActivityTickle) && - (request->fArg1 == (void *) false)) + (((uintptr_t) request->fArg1) & kTickleTypePowerDrop) && + fIdleTimerPeriod) { - // Idle timer expiration - power drop request completed. - // Restart the idle timer if deviceDesire can go lower, otherwise set - // a flag so we know to restart idle timer when fDeviceDesire > 0. - - if (fDeviceDesire > 0) - { - fActivityTickleCount = 0; - clock_get_uptime(&fIdleTimerStartTime); - start_PM_idle_timer(); - } - else if (fHasAdvisoryDesire) - { - start_PM_idle_timer(); - } - else - { - fIdleTimerStopped = true; - } + restartIdleTimer(); } // If the request is linked, then Work queue has already incremented its @@ -6946,9 +7086,14 @@ bool IOService::servicePMRequest( IOPMRequest * request, IOPMWorkQueue * queue ) fIsPreChange = false; if (fHeadNoteChangeFlags & kIOPMParentInitiated) + { fMachineState = kIOPM_SyncFinish; + } else + { + assert(IS_ROOT_DOMAIN); fMachineState = kIOPM_SyncTellCapabilityDidChange; + } fDriverCallReason = kDriverCallInformPostChange; notifyChildren(); @@ -7068,13 +7213,8 @@ void IOService::executePMRequest( IOPMRequest * request ) case kIOPMRequestTypeSetIdleTimerPeriod: { fIdleTimerPeriod = (uintptr_t) request->fArg0; - if ((false == fLockedFlags.PMStop) && (fIdleTimerPeriod > 0)) - { - fActivityTickleCount = 0; - clock_get_uptime(&fIdleTimerStartTime); - start_PM_idle_timer(); - } + restartIdleTimer(); } break; @@ -7425,10 +7565,12 @@ void IOPMRequest::reset( void ) fType = kIOPMRequestTypeInvalid; +#if NOT_READY if (fCompletionAction) { fCompletionAction(fCompletionTarget, fCompletionParam, fCompletionStatus); } +#endif if (fTarget) { @@ -7448,7 +7590,7 @@ bool IOPMRequest::attachNextRequest( IOPMRequest * next ) fRequestNext = next; fRequestNext->fWorkWaitCount++; #if LOG_REQUEST_ATTACH - kprintf("Attached next: %p [0x%x] -> %p [0x%x, %u] %s\n", + PM_LOG("Attached next: %p [0x%x] -> %p [0x%x, %u] %s\n", this, (uint32_t) fType, fRequestNext, (uint32_t) fRequestNext->fType, (uint32_t) fRequestNext->fWorkWaitCount, @@ -7469,7 +7611,7 @@ bool IOPMRequest::detachNextRequest( void ) if (fRequestNext->fWorkWaitCount) fRequestNext->fWorkWaitCount--; #if LOG_REQUEST_ATTACH - kprintf("Detached next: %p [0x%x] -> %p [0x%x, %u] %s\n", + PM_LOG("Detached next: %p [0x%x] -> %p [0x%x, %u] %s\n", this, (uint32_t) fType, fRequestNext, (uint32_t) fRequestNext->fType, (uint32_t) fRequestNext->fWorkWaitCount, @@ -7492,7 +7634,7 @@ bool IOPMRequest::attachRootRequest( IOPMRequest * root ) fRequestRoot = root; fRequestRoot->fFreeWaitCount++; #if LOG_REQUEST_ATTACH - kprintf("Attached root: %p [0x%x] -> %p [0x%x, %u] %s\n", + PM_LOG("Attached root: %p [0x%x] -> %p [0x%x, %u] %s\n", this, (uint32_t) fType, fRequestRoot, 
(uint32_t) fRequestRoot->fType, (uint32_t) fRequestRoot->fFreeWaitCount, @@ -7513,7 +7655,7 @@ bool IOPMRequest::detachRootRequest( void ) if (fRequestRoot->fFreeWaitCount) fRequestRoot->fFreeWaitCount--; #if LOG_REQUEST_ATTACH - kprintf("Detached root: %p [0x%x] -> %p [0x%x, %u] %s\n", + PM_LOG("Detached root: %p [0x%x] -> %p [0x%x, %u] %s\n", this, (uint32_t) fType, fRequestRoot, (uint32_t) fRequestRoot->fType, (uint32_t) fRequestRoot->fFreeWaitCount, diff --git a/iokit/Kernel/IOServicePMPrivate.h b/iokit/Kernel/IOServicePMPrivate.h index 47f99ea45..8651e6af8 100644 --- a/iokit/Kernel/IOServicePMPrivate.h +++ b/iokit/Kernel/IOServicePMPrivate.h @@ -229,20 +229,22 @@ private: // PM state lock. IOLock * PMLock; - unsigned int InitialPowerChange:1; - unsigned int InitialSetPowerState:1; - unsigned int DeviceOverrideEnabled:1; - unsigned int DoNotPowerDown:1; - unsigned int ParentsKnowState:1; - unsigned int StrictTreeOrder:1; - unsigned int IdleTimerStopped:1; - unsigned int AdjustPowerScheduled:1; - unsigned int IsPreChange:1; - unsigned int DriverCallBusy:1; - unsigned int PCDFunctionOverride:1; - unsigned int IdleTimerIgnored:1; - unsigned int HasAdvisoryDesire:1; - unsigned int AdvisoryTickleUsed:1; + unsigned int InitialPowerChange :1; + unsigned int InitialSetPowerState :1; + unsigned int DeviceOverrideEnabled :1; + unsigned int DoNotPowerDown :1; + unsigned int ParentsKnowState :1; + unsigned int StrictTreeOrder :1; + unsigned int IdleTimerStopped :1; + unsigned int AdjustPowerScheduled :1; + + unsigned int IsPreChange :1; + unsigned int DriverCallBusy :1; + unsigned int PCDFunctionOverride :1; + unsigned int IdleTimerIgnored :1; + unsigned int HasAdvisoryDesire :1; + unsigned int AdvisoryTickleUsed :1; + unsigned int ResetPowerStateOnWake :1; // Time of last device activity. AbsoluteTime DeviceActiveTimestamp; @@ -384,6 +386,7 @@ private: #define fIdleTimerIgnored pwrMgt->IdleTimerIgnored #define fHasAdvisoryDesire pwrMgt->HasAdvisoryDesire #define fAdvisoryTickleUsed pwrMgt->AdvisoryTickleUsed +#define fResetPowerStateOnWake pwrMgt->ResetPowerStateOnWake #define fDeviceActiveTimestamp pwrMgt->DeviceActiveTimestamp #define fActivityLock pwrMgt->ActivityLock #define fIdleTimerPeriod pwrMgt->IdleTimerPeriod @@ -464,6 +467,17 @@ the ack timer is ticking every tenth of a second. 
#define kIOPMSyncTellPowerDown 0x0400 // send the ask/will power off messages #define kIOPMSyncCancelPowerDown 0x0800 // sleep cancel for maintenance wake #define kIOPMInitialPowerChange 0x1000 // set for initial power change +#define kIOPMRootChangeUp 0x2000 // Root power domain change up +#define kIOPMRootChangeDown 0x4000 // Root power domain change down + +#define kIOPMRootBroadcastFlags (kIOPMSynchronize | \ + kIOPMRootChangeUp | kIOPMRootChangeDown) + +// Activity tickle request flags +#define kTickleTypePowerDrop 0x01 +#define kTickleTypePowerRise 0x02 +#define kTickleTypeActivity 0x04 +#define kTickleTypeAdvisory 0x08 enum { kDriverCallInformPreChange, diff --git a/kgmacros b/kgmacros index 0f5dcbc2b..6a12bb11c 100644 --- a/kgmacros +++ b/kgmacros @@ -807,7 +807,7 @@ define showactint set $kgm_actint_framecount = 0 while ($mysp != 0) && (($mysp & $stkmask) == 0) \ && ($mysp != $prevsp) \ - && ((((unsigned long) $mysp ^ (unsigned long) $prevsp) < 0x2000) \ + && ((((unsigned long) $mysp - (unsigned long) $prevsp) < 0x4000) \ || (((unsigned long)$mysp < ((unsigned long) ($kgm_thread->kernel_stack+kernel_stack_size))) \ && ((unsigned long)$mysp > (unsigned long) ($kgm_thread->kernel_stack)))) \ && ($kgm_actint_framecount < 128) diff --git a/libkern/c++/OSMetaClass.cpp b/libkern/c++/OSMetaClass.cpp index 8bd5fa522..fafd4547c 100644 --- a/libkern/c++/OSMetaClass.cpp +++ b/libkern/c++/OSMetaClass.cpp @@ -813,8 +813,10 @@ OSMetaClass::removeInstance(const OSObject * instance, bool super) const if (superClassLink) { superClassLink->removeInstance(reserved->instances, true); } + IOLockLock(sAllClassesLock); reserved->instances->release(); reserved->instances = 0; + IOLockUnlock(sAllClassesLock); } } diff --git a/libsyscall/wrappers/__get_cpu_capabilities.s b/libsyscall/wrappers/__get_cpu_capabilities.s index f03e44420..de177986a 100644 --- a/libsyscall/wrappers/__get_cpu_capabilities.s +++ b/libsyscall/wrappers/__get_cpu_capabilities.s @@ -33,8 +33,8 @@ .align 2, 0x90 .globl __get_cpu_capabilities __get_cpu_capabilities: - movq $(_COMM_PAGE_CPU_CAPABILITIES), %rax - movl (%rax), %eax + movq $(_COMM_PAGE_CPU_CAPABILITIES64), %rax + movq (%rax), %rax ret #elif defined(__i386__) @@ -43,7 +43,8 @@ __get_cpu_capabilities: .align 2, 0x90 .globl __get_cpu_capabilities __get_cpu_capabilities: - movl _COMM_PAGE_CPU_CAPABILITIES, %eax + movl _COMM_PAGE_CPU_CAPABILITIES64, %eax + movl _COMM_PAGE_CPU_CAPABILITIES64+4, %edx ret #else diff --git a/osfmk/conf/files.x86_64 b/osfmk/conf/files.x86_64 index 02f3cadf8..50345face 100644 --- a/osfmk/conf/files.x86_64 +++ b/osfmk/conf/files.x86_64 @@ -134,3 +134,4 @@ osfmk/kperf/x86_64/kperf_mp.c optional kperf osfmk/i386/startup64.c standard osfmk/x86_64/idt64.s standard + diff --git a/osfmk/i386/AT386/model_dep.c b/osfmk/i386/AT386/model_dep.c index 6315fc5e9..8d01210e9 100644 --- a/osfmk/i386/AT386/model_dep.c +++ b/osfmk/i386/AT386/model_dep.c @@ -217,10 +217,6 @@ machine_startup(void) machine_conf(); -#if NOTYET - ml_thrm_init(); /* Start thermal monitoring on this processor */ -#endif - /* * Start the system. 
*/ diff --git a/osfmk/i386/Diagnostics.c b/osfmk/i386/Diagnostics.c index 0921ad575..3023aeef9 100644 --- a/osfmk/i386/Diagnostics.c +++ b/osfmk/i386/Diagnostics.c @@ -72,6 +72,12 @@ #include #include +#include +#include +#include + +#define PERMIT_PERMCHECK (0) + diagWork dgWork; uint64_t lastRuptClear = 0ULL; @@ -80,17 +86,33 @@ void cpu_powerstats(void *); typedef struct { uint64_t caperf; uint64_t cmperf; - uint64_t ccres[3]; - uint64_t crtimes[4]; - uint64_t citimes[4]; + uint64_t ccres[6]; + uint64_t crtimes[CPU_RTIME_BINS]; + uint64_t citimes[CPU_ITIME_BINS]; uint64_t crtime_total; uint64_t citime_total; + uint64_t cpu_idle_exits; + uint64_t cpu_insns; + uint64_t cpu_ucc; + uint64_t cpu_urc; } core_energy_stat_t; typedef struct { - uint64_t pkg_cres[2][4]; + uint64_t pkg_cres[2][7]; uint64_t pkg_power_unit; uint64_t pkg_energy; + uint64_t pp0_energy; + uint64_t pp1_energy; + uint64_t ddr_energy; + uint64_t llc_flushed_cycles; + uint64_t ring_ratio_instantaneous; + uint64_t IA_frequency_clipping_cause; + uint64_t GT_frequency_clipping_cause; + uint64_t pkg_idle_exits; + uint64_t pkg_rtimes[CPU_RTIME_BINS]; + uint64_t pkg_itimes[CPU_ITIME_BINS]; + uint64_t mbus_delay_time; + uint64_t mint_delay_time; uint32_t ncpus; core_energy_stat_t cest[]; } pkg_energy_statistics_t; @@ -99,9 +121,9 @@ typedef struct { int diagCall64(x86_saved_state_t * state) { - uint64_t curpos, i, j; - uint64_t selector, data; - uint64_t currNap, durNap; + uint64_t curpos, i, j; + uint64_t selector, data; + uint64_t currNap, durNap; x86_saved_state64_t *regs; boolean_t diagflag; uint32_t rval = 0; @@ -175,12 +197,54 @@ diagCall64(x86_saved_state_t * state) pkes.pkg_cres[0][2] = ((uint64_t)c6h << 32) | c6l; pkes.pkg_cres[0][3] = ((uint64_t)c7h << 32) | c7l; + uint32_t cpumodel = cpuid_info()->cpuid_model; + boolean_t c8avail; + switch (cpumodel) { + case CPUID_MODEL_HASWELL_ULT: + c8avail = TRUE; + break; + default: + c8avail = FALSE; + break; + } + uint64_t c8r = ~0ULL, c9r = ~0ULL, c10r = ~0ULL; + + if (c8avail) { + rdmsr64_carefully(MSR_IA32_PKG_C8_RESIDENCY, &c8r); + rdmsr64_carefully(MSR_IA32_PKG_C9_RESIDENCY, &c9r); + rdmsr64_carefully(MSR_IA32_PKG_C10_RESIDENCY, &c10r); + } + + pkes.pkg_cres[0][4] = c8r; + pkes.pkg_cres[0][5] = c9r; + pkes.pkg_cres[0][6] = c10r; + + pkes.ddr_energy = ~0ULL; + rdmsr64_carefully(MSR_IA32_DDR_ENERGY_STATUS, &pkes.ddr_energy); + pkes.llc_flushed_cycles = ~0ULL; + rdmsr64_carefully(MSR_IA32_LLC_FLUSHED_RESIDENCY_TIMER, &pkes.llc_flushed_cycles); + + pkes.ring_ratio_instantaneous = ~0ULL; + rdmsr64_carefully(MSR_IA32_RING_PERF_STATUS, &pkes.ring_ratio_instantaneous); + + pkes.IA_frequency_clipping_cause = ~0ULL; + rdmsr64_carefully(MSR_IA32_IA_PERF_LIMIT_REASONS, &pkes.IA_frequency_clipping_cause); + + pkes.GT_frequency_clipping_cause = ~0ULL; + rdmsr64_carefully(MSR_IA32_GT_PERF_LIMIT_REASONS, &pkes.GT_frequency_clipping_cause); + rdmsr_carefully(MSR_IA32_PKG_POWER_SKU_UNIT, &pkg_unit_l, &pkg_unit_h); rdmsr_carefully(MSR_IA32_PKG_ENERGY_STATUS, &pkg_ecl, &pkg_ech); - pkes.pkg_power_unit = ((uint64_t)pkg_unit_h << 32) | pkg_unit_l; pkes.pkg_energy = ((uint64_t)pkg_ech << 32) | pkg_ecl; + rdmsr_carefully(MSR_IA32_PP0_ENERGY_STATUS, &pkg_ecl, &pkg_ech); + pkes.pp0_energy = ((uint64_t)pkg_ech << 32) | pkg_ecl; + + rdmsr_carefully(MSR_IA32_PP1_ENERGY_STATUS, &pkg_ecl, &pkg_ech); + pkes.pp1_energy = ((uint64_t)pkg_ech << 32) | pkg_ecl; + + pkes.pkg_idle_exits = current_cpu_datap()->lcpu.package->package_idle_exits; pkes.ncpus = real_ncpus; (void) ml_set_interrupts_enabled(TRUE); @@ 
-191,6 +255,8 @@ diagCall64(x86_saved_state_t * state) mp_cpus_call(CPUMASK_ALL, ASYNC, cpu_powerstats, NULL); for (i = 0; i < real_ncpus; i++) { + (void) ml_set_interrupts_enabled(FALSE); + cest.caperf = cpu_data_ptr[i]->cpu_aperf; cest.cmperf = cpu_data_ptr[i]->cpu_mperf; cest.ccres[0] = cpu_data_ptr[i]->cpu_c3res; @@ -199,8 +265,14 @@ diagCall64(x86_saved_state_t * state) bcopy(&cpu_data_ptr[i]->cpu_rtimes[0], &cest.crtimes[0], sizeof(cest.crtimes)); bcopy(&cpu_data_ptr[i]->cpu_itimes[0], &cest.citimes[0], sizeof(cest.citimes)); + cest.citime_total = cpu_data_ptr[i]->cpu_itime_total; cest.crtime_total = cpu_data_ptr[i]->cpu_rtime_total; + cest.cpu_idle_exits = cpu_data_ptr[i]->cpu_idle_exits; + cest.cpu_insns = cpu_data_ptr[i]->cpu_cur_insns; + cest.cpu_ucc = cpu_data_ptr[i]->cpu_cur_ucc; + cest.cpu_urc = cpu_data_ptr[i]->cpu_cur_urc; + (void) ml_set_interrupts_enabled(TRUE); copyout(&cest, curpos, sizeof(cest)); curpos += sizeof(cest); @@ -208,6 +280,13 @@ diagCall64(x86_saved_state_t * state) rval = 1; } break; + case dgEnaPMC: + { + boolean_t enable = TRUE; + mp_cpus_call(CPUMASK_ALL, ASYNC, cpu_pmc_control, &enable); + rval = 1; + } + break; #if DEBUG case dgGzallocTest: @@ -220,10 +299,10 @@ diagCall64(x86_saved_state_t * state) kfree(ptr, 1024); *ptr = 0x42; } - break; + break; #endif -#if defined(__x86_64__) +#if PERMIT_PERMCHECK case dgPermCheck: { (void) ml_set_interrupts_enabled(TRUE); @@ -233,7 +312,7 @@ diagCall64(x86_saved_state_t * state) rval = pmap_permissions_verify(kernel_pmap, kernel_map, 0, ~0ULL); } break; -#endif /* __x86_64__*/ +#endif /* PERMIT_PERMCHECK */ default: /* Handle invalid ones */ rval = 0; /* Return an exception */ @@ -246,7 +325,7 @@ diagCall64(x86_saved_state_t * state) void cpu_powerstats(__unused void *arg) { cpu_data_t *cdp = current_cpu_datap(); - int cnum = cdp->cpu_number; + __unused int cnum = cdp->cpu_number; uint32_t cl = 0, ch = 0, mpl = 0, mph = 0, apl = 0, aph = 0; rdmsr_carefully(MSR_IA32_MPERF, &mpl, &mph); @@ -255,8 +334,9 @@ void cpu_powerstats(__unused void *arg) { cdp->cpu_mperf = ((uint64_t)mph << 32) | mpl; cdp->cpu_aperf = ((uint64_t)aph << 32) | apl; - if (cnum & 1) - return; + uint64_t ctime = mach_absolute_time(); + cdp->cpu_rtime_total += ctime - cdp->cpu_ixtime; + cdp->cpu_ixtime = ctime; rdmsr_carefully(MSR_IA32_CORE_C3_RESIDENCY, &cl, &ch); cdp->cpu_c3res = ((uint64_t)ch << 32) | cl; @@ -266,4 +346,28 @@ void cpu_powerstats(__unused void *arg) { rdmsr_carefully(MSR_IA32_CORE_C7_RESIDENCY, &cl, &ch); cdp->cpu_c7res = ((uint64_t)ch << 32) | cl; + + uint64_t insns = read_pmc(FIXED_PMC0); + uint64_t ucc = read_pmc(FIXED_PMC1); + uint64_t urc = read_pmc(FIXED_PMC2); + cdp->cpu_cur_insns = insns; + cdp->cpu_cur_ucc = ucc; + cdp->cpu_cur_urc = urc; +} + +void cpu_pmc_control(void *enablep) { + boolean_t enable = *(boolean_t *)enablep; + cpu_data_t *cdp = current_cpu_datap(); + + if (enable) { + wrmsr64(0x38F, 0x70000000FULL); + wrmsr64(0x38D, 0x333); + set_cr4(get_cr4() | CR4_PCE); + + } else { + wrmsr64(0x38F, 0); + wrmsr64(0x38D, 0); + set_cr4((get_cr4() & ~CR4_PCE)); + } + cdp->cpu_fixed_pmcs_enabled = enable; } diff --git a/osfmk/i386/Diagnostics.h b/osfmk/i386/Diagnostics.h index 2ce145e27..4e37eea2b 100644 --- a/osfmk/i386/Diagnostics.h +++ b/osfmk/i386/Diagnostics.h @@ -70,7 +70,7 @@ int diagCall64(x86_saved_state_t *regs); #define dgBind 18 #define dgAcntg 20 #define dgKlra 21 -#define dgKfree 22 +#define dgEnaPMC 22 #define dgWar 23 #define dgNapStat 24 #define dgRuptStat 25 @@ -100,7 +100,17 @@ typedef struct diagWork 
{ /* Diagnostic work area */ extern diagWork dgWork; - +#define FIXED_PMC (1 << 30) +#define FIXED_PMC0 (FIXED_PMC) +#define FIXED_PMC1 (FIXED_PMC | 1) +#define FIXED_PMC2 (FIXED_PMC | 2) + +static inline uint64_t read_pmc(uint32_t counter) +{ + uint32_t lo = 0, hi = 0; + __asm__ volatile("rdpmc" : "=a" (lo), "=d" (hi) : "c" (counter)); + return ((((uint64_t)hi) << 32) | ((uint64_t)lo)); +} #endif /* _DIAGNOSTICS_H_ */ #endif /* KERNEL_PRIVATE */ diff --git a/osfmk/i386/acpi.c b/osfmk/i386/acpi.c index 4106c9283..69a45836a 100644 --- a/osfmk/i386/acpi.c +++ b/osfmk/i386/acpi.c @@ -53,6 +53,9 @@ #include #include +#include +#include +#include #include #include #include @@ -61,12 +64,11 @@ #include #endif #include - #include #if CONFIG_SLEEP extern void acpi_sleep_cpu(acpi_sleep_callback, void * refcon); -extern void acpi_wake_prot(void); +extern void acpi_wake_prot(void); #endif extern kern_return_t IOCPURunPlatformQuiesceActions(void); extern kern_return_t IOCPURunPlatformActiveActions(void); @@ -93,6 +95,9 @@ typedef struct acpi_hibernate_callback_data acpi_hibernate_callback_data_t; unsigned int save_kdebug_enable = 0; static uint64_t acpi_sleep_abstime; +static uint64_t acpi_idle_abstime; +static uint64_t acpi_wake_abstime; +boolean_t deep_idle_rebase = TRUE; #if CONFIG_SLEEP static void @@ -153,6 +158,7 @@ acpi_hibernate(void *refcon) extern void slave_pstart(void); +extern unsigned int wake_nkdbufs; void acpi_sleep_kernel(acpi_sleep_callback func, void *refcon) @@ -302,11 +308,22 @@ acpi_sleep_kernel(acpi_sleep_callback func, void *refcon) if (lapic_probe()) lapic_configure(); + acpi_wake_abstime = mach_absolute_time(); + /* let the realtime clock reset */ rtc_sleep_wakeup(acpi_sleep_abstime); kdebug_enable = save_kdebug_enable; + if (kdebug_enable == 0) { + if (wake_nkdbufs) + start_kern_tracing(wake_nkdbufs, TRUE); + } + + /* Reconfigure FP/SIMD unit */ + init_fpu(); + clear_ts(); + IOCPURunPlatformActiveActions(); if (did_hibernate) { @@ -334,8 +351,7 @@ acpi_sleep_kernel(acpi_sleep_callback func, void *refcon) /* Restart timer interrupts */ rtc_timer_start(); - /* Reconfigure FP/SIMD unit */ - init_fpu(); + #if HIBERNATION #ifdef __i386__ @@ -358,6 +374,99 @@ acpi_sleep_kernel(acpi_sleep_callback func, void *refcon) #endif } +/* + * acpi_idle_kernel is called by the ACPI Platform kext to request the kernel + * to idle the boot processor in the deepest C-state for S0 sleep. All slave + * processors are expected already to have been offlined in the deepest C-state. + * + * The contract with ACPI is that although the kernel is called with interrupts + * disabled, interrupts may need to be re-enabled to dismiss any pending timer + * interrupt. However, the callback function will be called once this has + * occurred and interrupts are guaranteed to be disabled at that time, + * and to remain disabled during C-state entry, exit (wake) and return + * from acpi_idle_kernel. + */ +void +acpi_idle_kernel(acpi_sleep_callback func, void *refcon) +{ + boolean_t istate = ml_get_interrupts_enabled(); + + kprintf("acpi_idle_kernel, cpu=%d, interrupts %s\n", + cpu_number(), istate ? "enabled" : "disabled"); + + assert(cpu_number() == master_cpu); + + /* + * Effectively set the boot cpu offline. + * This will stop further deadlines being set. 
+ */ + cpu_datap(master_cpu)->cpu_running = FALSE; + + /* Cancel any pending deadline */ + setPop(0); + while (lapic_is_interrupting(LAPIC_TIMER_VECTOR)) { + (void) ml_set_interrupts_enabled(TRUE); + setPop(0); + ml_set_interrupts_enabled(FALSE); + } + + /* + * Call back to caller to indicate that interrupts will remain + * disabled while we deep idle, wake and return. + */ + func(refcon); + + acpi_idle_abstime = mach_absolute_time(); + + KERNEL_DEBUG_CONSTANT( + MACHDBG_CODE(DBG_MACH_SCHED, MACH_DEEP_IDLE) | DBG_FUNC_START, + acpi_idle_abstime, deep_idle_rebase, 0, 0, 0); + + /* + * Disable tracing during S0-sleep + * unless overridden by sysctl -w tsc.deep_idle_rebase=0 + */ + if (deep_idle_rebase) { + save_kdebug_enable = kdebug_enable; + kdebug_enable = 0; + } + + /* + * Call into power-management to enter the lowest C-state. + * Note when called on the boot processor this routine will + * return directly when awoken. + */ + pmCPUHalt(PM_HALT_SLEEP); + + /* + * Get wakeup time relative to the TSC which has progressed. + * Then rebase nanotime to reflect time not progressing over sleep + * - unless overriden so that tracing can occur during deep_idle. + */ + acpi_wake_abstime = mach_absolute_time(); + if (deep_idle_rebase) { + rtc_sleep_wakeup(acpi_idle_abstime); + kdebug_enable = save_kdebug_enable; + } + + cpu_datap(master_cpu)->cpu_running = TRUE; + + KERNEL_DEBUG_CONSTANT( + MACHDBG_CODE(DBG_MACH_SCHED, MACH_DEEP_IDLE) | DBG_FUNC_END, + acpi_wake_abstime, acpi_wake_abstime - acpi_idle_abstime, 0, 0, 0); + + /* Like S3 sleep, turn on tracing if trace_wake boot-arg is present */ + if (kdebug_enable == 0) { + if (wake_nkdbufs) + start_kern_tracing(wake_nkdbufs, TRUE); + } + + IOCPURunPlatformActiveActions(); + + /* Restart timer interrupts */ + rtc_timer_start(); +} + extern char real_mode_bootstrap_end[]; extern char real_mode_bootstrap_base[]; diff --git a/osfmk/i386/acpi.h b/osfmk/i386/acpi.h index a64e8127c..ed8fdd08e 100644 --- a/osfmk/i386/acpi.h +++ b/osfmk/i386/acpi.h @@ -46,6 +46,7 @@ typedef void (*acpi_sleep_callback)(void * refcon); extern vm_offset_t acpi_install_wake_handler(void); extern void acpi_sleep_kernel(acpi_sleep_callback func, void * refcon); +extern void acpi_idle_kernel(acpi_sleep_callback func, void * refcon); void install_real_mode_bootstrap(void *prot_entry); #endif /* ASSEMBLER */ diff --git a/osfmk/i386/commpage/commpage.c b/osfmk/i386/commpage/commpage.c index 7076ff533..30ea3e10f 100644 --- a/osfmk/i386/commpage/commpage.c +++ b/osfmk/i386/commpage/commpage.c @@ -80,17 +80,14 @@ extern vm_map_t commpage_text64_map; // the shared submap, set up in vm init char *commPagePtr32 = NULL; // virtual addr in kernel map of 32-bit commpage char *commPagePtr64 = NULL; // ...and of 64-bit commpage -char *commPageTextPtr32 = NULL; // virtual addr in kernel map of 32-bit commpage -char *commPageTextPtr64 = NULL; // ...and of 64-bit commpage -uint32_t _cpu_capabilities = 0; // define the capability vector +char *commPageTextPtr32 = NULL; // virtual addr in kernel map of 32-bit commpage +char *commPageTextPtr64 = NULL; // ...and of 64-bit commpage -int noVMX = 0; /* if true, do not set kHasAltivec in ppc _cpu_capabilities */ +uint64_t _cpu_capabilities = 0; // define the capability vector typedef uint32_t commpage_address_t; -static commpage_address_t next; // next available address in comm page -static commpage_address_t cur_routine; // comm page address of "current" routine -static boolean_t matched; // true if we've found a match for "current" routine +static 
commpage_address_t next; // next available address in comm page static char *commPagePtr; // virtual addr in kernel map of commpage we are working on static commpage_address_t commPageBaseOffset; // subtract from 32-bit runtime address to get offset in virtual commpage in kernel map @@ -205,7 +202,7 @@ commpage_cpus( void ) static void commpage_init_cpu_capabilities( void ) { - uint32_t bits; + uint64_t bits; int cpus; ml_cpu_info_t cpu_info; @@ -254,30 +251,46 @@ commpage_init_cpu_capabilities( void ) } cpus = commpage_cpus(); // how many CPUs do we have - if (cpus == 1) - bits |= kUP; - bits |= (cpus << kNumCPUsShift); bits |= kFastThreadLocalStorage; // we use %gs for TLS - if (cpu_mode_is64bit()) // k64Bit means processor is 64-bit capable - bits |= k64Bit; - - if (tscFreq <= SLOW_TSC_THRESHOLD) /* is TSC too slow for _commpage_nanotime? */ - bits |= kSlow; - - bits |= (cpuid_features() & CPUID_FEATURE_AES) ? kHasAES : 0; - - bits |= (cpuid_features() & CPUID_FEATURE_F16C) ? kHasF16C : 0; - bits |= (cpuid_features() & CPUID_FEATURE_RDRAND) ? kHasRDRAND : 0; - bits |= ((cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_ENFSTRG) && - (rdmsr64(MSR_IA32_MISC_ENABLE) & 1ULL )) ? kHasENFSTRG : 0; - +#define setif(_bits, _bit, _condition) \ + if (_condition) _bits |= _bit + + setif(bits, kUP, cpus == 1); + setif(bits, k64Bit, cpu_mode_is64bit()); + setif(bits, kSlow, tscFreq <= SLOW_TSC_THRESHOLD); + + setif(bits, kHasAES, cpuid_features() & + CPUID_FEATURE_AES); + setif(bits, kHasF16C, cpuid_features() & + CPUID_FEATURE_F16C); + setif(bits, kHasRDRAND, cpuid_features() & + CPUID_FEATURE_RDRAND); + setif(bits, kHasFMA, cpuid_features() & + CPUID_FEATURE_FMA); + + setif(bits, kHasBMI1, cpuid_leaf7_features() & + CPUID_LEAF7_FEATURE_BMI1); + setif(bits, kHasBMI2, cpuid_leaf7_features() & + CPUID_LEAF7_FEATURE_BMI2); + setif(bits, kHasRTM, cpuid_leaf7_features() & + CPUID_LEAF7_FEATURE_RTM); + setif(bits, kHasHLE, cpuid_leaf7_features() & + CPUID_LEAF7_FEATURE_HLE); + setif(bits, kHasAVX2_0, cpuid_leaf7_features() & + CPUID_LEAF7_FEATURE_AVX2); + + uint64_t misc_enable = rdmsr64(MSR_IA32_MISC_ENABLE); + setif(bits, kHasENFSTRG, (misc_enable & 1ULL) && + (cpuid_leaf7_features() & + CPUID_LEAF7_FEATURE_ENFSTRG)); + _cpu_capabilities = bits; // set kernel version for use by drivers etc } -int +uint64_t _get_cpu_capabilities(void) { return _cpu_capabilities; @@ -305,27 +318,9 @@ commpage_stuff( */ static void commpage_stuff_routine( - commpage_descriptor *rd ) + commpage_descriptor *rd ) { - uint32_t must,cant; - - if (rd->commpage_address != cur_routine) { - if ((cur_routine!=0) && (matched==0)) - panic("commpage no match for last, next address %08x", rd->commpage_address); - cur_routine = rd->commpage_address; - matched = 0; - } - - must = _cpu_capabilities & rd->musthave; - cant = _cpu_capabilities & rd->canthave; - - if ((must == rd->musthave) && (cant == 0)) { - if (matched) - panic("commpage multiple matches for address %08x", rd->commpage_address); - matched = 1; - - commpage_stuff(rd->commpage_address,rd->code_address,rd->code_length); - } + commpage_stuff(rd->commpage_address,rd->code_address,rd->code_length); } /* Fill in the 32- or 64-bit commpage. Called once for each. 
@@ -341,15 +336,14 @@ commpage_populate_one( const char* signature, // "commpage 32-bit" or "commpage 64-bit" vm_prot_t uperm) { - uint8_t c1; - short c2; - int c4; - uint64_t c8; + uint8_t c1; + uint16_t c2; + int c4; + uint64_t c8; uint32_t cfamily; short version = _COMM_PAGE_THIS_VERSION; next = 0; - cur_routine = 0; commPagePtr = (char *)commpage_allocate( submap, (vm_size_t) area_used, uperm ); *kernAddressPtr = commPagePtr; // save address either in commPagePtr32 or 64 commPageBaseOffset = base_offset; @@ -358,10 +352,13 @@ commpage_populate_one( /* Stuff in the constants. We move things into the comm page in strictly * ascending order, so we can check for overlap and panic if so. + * Note: the 32-bit cpu_capabilities vector is retained in addition to + * the expanded 64-bit vector. */ - commpage_stuff(_COMM_PAGE_SIGNATURE,signature,(int)strlen(signature)); + commpage_stuff(_COMM_PAGE_SIGNATURE,signature,(int)MIN(_COMM_PAGE_SIGNATURELEN, strlen(signature))); + commpage_stuff(_COMM_PAGE_CPU_CAPABILITIES64,&_cpu_capabilities,sizeof(_cpu_capabilities)); commpage_stuff(_COMM_PAGE_VERSION,&version,sizeof(short)); - commpage_stuff(_COMM_PAGE_CPU_CAPABILITIES,&_cpu_capabilities,sizeof(int)); + commpage_stuff(_COMM_PAGE_CPU_CAPABILITIES,&_cpu_capabilities,sizeof(uint32_t)); c2 = 32; // default if (_cpu_capabilities & kCache64) @@ -369,7 +366,7 @@ commpage_populate_one( else if (_cpu_capabilities & kCache128) c2 = 128; commpage_stuff(_COMM_PAGE_CACHE_LINESIZE,&c2,2); - + c4 = MP_SPIN_TRIES; commpage_stuff(_COMM_PAGE_SPIN_COUNT,&c4,4); @@ -442,8 +439,7 @@ commpage_populate( void ) void commpage_text_populate( void ){ commpage_descriptor **rd; - next =0; - cur_routine=0; + next = 0; commPagePtr = (char *) commpage_allocate(commpage_text32_map, (vm_size_t) _COMM_PAGE_TEXT_AREA_USED, VM_PROT_READ | VM_PROT_EXECUTE); commPageTextPtr32 = commPagePtr; @@ -457,8 +453,6 @@ void commpage_text_populate( void ){ for (rd = commpage_32_routines; *rd != NULL; rd++) { commpage_stuff_routine(*rd); } - if (!matched) - panic(" commpage_text no match for last routine "); #ifndef __LP64__ pmap_commpage32_init((vm_offset_t) commPageTextPtr32, _COMM_PAGE_TEXT_START, @@ -466,8 +460,7 @@ void commpage_text_populate( void ){ #endif if (_cpu_capabilities & k64Bit) { - next =0; - cur_routine=0; + next = 0; commPagePtr = (char *) commpage_allocate(commpage_text64_map, (vm_size_t) _COMM_PAGE_TEXT_AREA_USED, VM_PROT_READ | VM_PROT_EXECUTE); commPageTextPtr64 = commPagePtr; @@ -486,17 +479,12 @@ void commpage_text_populate( void ){ #endif } - if (!matched) - panic(" commpage_text no match for last routine "); - if (next > _COMM_PAGE_TEXT_END) panic("commpage text overflow: next=0x%08x, commPagePtr=%p", next, commPagePtr); } -/* Update commpage nanotime information. Note that we interleave - * setting the 32- and 64-bit commpages, in order to keep nanotime more - * nearly in sync between the two environments. +/* Update commpage nanotime information. * * This routine must be serialized by some external means, ie a lock. 
*/ @@ -520,7 +508,7 @@ commpage_set_nanotime( panic("nanotime trouble 1"); /* possibly not serialized */ if ( ns_base < p32->nt_ns_base ) panic("nanotime trouble 2"); - if ((shift != 32) && ((_cpu_capabilities & kSlow)==0) ) + if ((shift != 0) && ((_cpu_capabilities & kSlow)==0) ) panic("nanotime trouble 3"); next_gen = ++generation; @@ -604,14 +592,14 @@ commpage_set_memory_pressure( cp = commPagePtr32; if ( cp ) { cp += (_COMM_PAGE_MEMORY_PRESSURE - _COMM_PAGE32_BASE_ADDRESS); - ip = (uint32_t*) cp; + ip = (uint32_t*) (void *) cp; *ip = (uint32_t) pressure; } cp = commPagePtr64; if ( cp ) { cp += (_COMM_PAGE_MEMORY_PRESSURE - _COMM_PAGE32_START_ADDRESS); - ip = (uint32_t*) cp; + ip = (uint32_t*) (void *) cp; *ip = (uint32_t) pressure; } @@ -633,14 +621,14 @@ commpage_set_spin_count( cp = commPagePtr32; if ( cp ) { cp += (_COMM_PAGE_SPIN_COUNT - _COMM_PAGE32_BASE_ADDRESS); - ip = (uint32_t*) cp; + ip = (uint32_t*) (void *) cp; *ip = (uint32_t) count; } cp = commPagePtr64; if ( cp ) { cp += (_COMM_PAGE_SPIN_COUNT - _COMM_PAGE32_START_ADDRESS); - ip = (uint32_t*) cp; + ip = (uint32_t*) (void *) cp; *ip = (uint32_t) count; } diff --git a/osfmk/i386/commpage/commpage.h b/osfmk/i386/commpage/commpage.h index c8369d78d..030d294af 100644 --- a/osfmk/i386/commpage/commpage.h +++ b/osfmk/i386/commpage/commpage.h @@ -81,7 +81,7 @@ .align alignment, 0x90 ;\ L ## label ## : -#define COMMPAGE_DESCRIPTOR(label,address,must,cant) \ +#define COMMPAGE_DESCRIPTOR(label,address) \ L ## label ## _end: ;\ .set L ## label ## _size, L ## label ## _end - L ## label ;\ .const_data ;\ @@ -90,8 +90,6 @@ COMMPAGE_DESCRIPTOR_NAME(label) ## : ;\ COMMPAGE_DESCRIPTOR_FIELD_POINTER L ## label ;\ .long L ## label ## _size ;\ .long address ;\ - .long must ;\ - .long cant ;\ .text @@ -131,8 +129,6 @@ typedef struct commpage_descriptor { void *code_address; // address of code uint32_t code_length; // length in bytes uint32_t commpage_address; // put at this address (_COMM_PAGE_BCOPY etc) - uint32_t musthave; // _cpu_capability bits we must have - uint32_t canthave; // _cpu_capability bits we can't have } commpage_descriptor; diff --git a/osfmk/i386/commpage/fifo_queues.s b/osfmk/i386/commpage/fifo_queues.s index e994ae945..81c041ae4 100644 --- a/osfmk/i386/commpage/fifo_queues.s +++ b/osfmk/i386/commpage/fifo_queues.s @@ -83,7 +83,7 @@ COMMPAGE_FUNCTION_START(preempt, 32, 4) movl $(-58),%eax /* 58 = pfz_exit */ xorl %ebx,%ebx // clear "preemption pending" flag sysenter -COMMPAGE_DESCRIPTOR(preempt,_COMM_PAGE_PREEMPT,0,0) +COMMPAGE_DESCRIPTOR(preempt,_COMM_PAGE_PREEMPT) /* Subroutine to back off if we cannot get the spinlock. 
Called @@ -107,7 +107,7 @@ COMMPAGE_FUNCTION_START(backoff, 32, 4) cmpl $0,8(%edi) // sniff the lockword jnz 1b // loop if still taken ret // lockword is free, so reenter PFZ -COMMPAGE_DESCRIPTOR(backoff,_COMM_PAGE_BACKOFF,0,0) +COMMPAGE_DESCRIPTOR(backoff,_COMM_PAGE_BACKOFF) /* Preemption-free-zone routine to FIFO Enqueue: @@ -152,7 +152,7 @@ COMMPAGE_FUNCTION_START(pfz_enqueue, 32, 4) movl %esi,4(%edi) // new element becomes last in q movl $0,8(%edi) // unlock spinlock ret -COMMPAGE_DESCRIPTOR(pfz_enqueue,_COMM_PAGE_PFZ_ENQUEUE,0,0) +COMMPAGE_DESCRIPTOR(pfz_enqueue,_COMM_PAGE_PFZ_ENQUEUE) /* Preemption-free-zone routine to FIFO Dequeue: @@ -198,7 +198,7 @@ COMMPAGE_FUNCTION_START(pfz_dequeue, 32, 4) 4: movl $0,8(%edi) // unlock spinlock ret -COMMPAGE_DESCRIPTOR(pfz_dequeue,_COMM_PAGE_PFZ_DEQUEUE,0,0) +COMMPAGE_DESCRIPTOR(pfz_dequeue,_COMM_PAGE_PFZ_DEQUEUE) @@ -233,7 +233,7 @@ COMMPAGE_FUNCTION_START(preempt_64, 64, 4) popq %rcx popq %rax ret -COMMPAGE_DESCRIPTOR(preempt_64,_COMM_PAGE_PREEMPT,0,0) +COMMPAGE_DESCRIPTOR(preempt_64,_COMM_PAGE_PREEMPT) /* Subroutine to back off if we cannot get the spinlock. Called @@ -252,7 +252,7 @@ COMMPAGE_FUNCTION_START(backoff_64, 64, 4) cmpl $0,16(%rdi) // sniff the lockword jnz 1b // loop if still taken ret // lockword is free, so reenter PFZ -COMMPAGE_DESCRIPTOR(backoff_64,_COMM_PAGE_BACKOFF,0,0) +COMMPAGE_DESCRIPTOR(backoff_64,_COMM_PAGE_BACKOFF) /* Preemption-free-zone routine to FIFO Enqueue: @@ -297,7 +297,7 @@ COMMPAGE_FUNCTION_START(pfz_enqueue_64, 64, 4) movq %rsi,8(%rdi) // new element becomes last in q movl $0,16(%rdi) // unlock spinlock ret -COMMPAGE_DESCRIPTOR(pfz_enqueue_64,_COMM_PAGE_PFZ_ENQUEUE,0,0) +COMMPAGE_DESCRIPTOR(pfz_enqueue_64,_COMM_PAGE_PFZ_ENQUEUE) @@ -344,4 +344,4 @@ COMMPAGE_FUNCTION_START(pfz_dequeue_64, 64, 4) 4: movl $0,16(%rdi) // unlock spinlock ret -COMMPAGE_DESCRIPTOR(pfz_dequeue_64,_COMM_PAGE_PFZ_DEQUEUE,0,0) +COMMPAGE_DESCRIPTOR(pfz_dequeue_64,_COMM_PAGE_PFZ_DEQUEUE) diff --git a/osfmk/i386/commpage/pthreads.s b/osfmk/i386/commpage/pthreads.s index a7226180b..c62094de1 100644 --- a/osfmk/i386/commpage/pthreads.s +++ b/osfmk/i386/commpage/pthreads.s @@ -155,7 +155,7 @@ COMMPAGE_FUNCTION_START(pfz_mutex_lock, 32, 4) orl $0x00180000,%eax // copy 24 bytes of arguments in trampoline xorl %ebx,%ebx // clear preemption flag sysenter -COMMPAGE_DESCRIPTOR(pfz_mutex_lock,_COMM_PAGE_PFZ_MUTEX_LOCK,0,0) +COMMPAGE_DESCRIPTOR(pfz_mutex_lock,_COMM_PAGE_PFZ_MUTEX_LOCK) @@ -224,5 +224,5 @@ COMMPAGE_FUNCTION_START(pfz_mutex_lock_64, 64, 4) movl $PTHRW_STATUS_SYSCALL,%eax // we made syscall popq %rbp ret -COMMPAGE_DESCRIPTOR(pfz_mutex_lock_64,_COMM_PAGE_PFZ_MUTEX_LOCK,0,0) +COMMPAGE_DESCRIPTOR(pfz_mutex_lock_64,_COMM_PAGE_PFZ_MUTEX_LOCK) diff --git a/osfmk/i386/cpu.c b/osfmk/i386/cpu.c index 4cdeed647..23b38e3db 100644 --- a/osfmk/i386/cpu.c +++ b/osfmk/i386/cpu.c @@ -91,8 +91,6 @@ cpu_sleep(void) { cpu_data_t *cdp = current_cpu_datap(); - i386_deactivate_cpu(); - PE_cpu_machine_quiesce(cdp->cpu_id); cpu_thread_halt(); diff --git a/osfmk/i386/cpu_capabilities.h b/osfmk/i386/cpu_capabilities.h index 3cf464e34..b8d5027af 100644 --- a/osfmk/i386/cpu_capabilities.h +++ b/osfmk/i386/cpu_capabilities.h @@ -57,23 +57,31 @@ #define kSlow 0x00004000 /* tsc < nanosecond */ #define kUP 0x00008000 /* set if (kNumCPUs == 1) */ #define kNumCPUs 0x00FF0000 /* number of CPUs (see _NumCPUs() below) */ +#define kNumCPUsShift 16 #define kHasAVX1_0 0x01000000 #define kHasRDRAND 0x02000000 #define kHasF16C 0x04000000 #define kHasENFSTRG 0x08000000 
-#define kNumCPUsShift 16 /* see _NumCPUs() below */ +#define kHasFMA 0x10000000 +#define kHasAVX2_0 0x20000000 +#define kHasBMI1 0x40000000 +#define kHasBMI2 0x80000000 +/* Extending into 64-bits from here: */ +#define kHasRTM 0x0000000100000000ULL +#define kHasHLE 0x0000000200000000ULL + #ifndef __ASSEMBLER__ #include __BEGIN_DECLS -extern int _get_cpu_capabilities( void ); +extern uint64_t _get_cpu_capabilities( void ); __END_DECLS inline static int _NumCPUs( void ) { - return (_get_cpu_capabilities() & kNumCPUs) >> kNumCPUsShift; + return (int) (_get_cpu_capabilities() & kNumCPUs) >> kNumCPUsShift; } #endif /* __ASSEMBLER__ */ @@ -151,13 +159,16 @@ int _NumCPUs( void ) /* data in the comm page */ -#define _COMM_PAGE_SIGNATURE (_COMM_PAGE_START_ADDRESS+0x000) /* first few bytes are a signature */ +#define _COMM_PAGE_SIGNATURE (_COMM_PAGE_START_ADDRESS+0x000) /* first 16 bytes are a signature */ +#define _COMM_PAGE_SIGNATURELEN (0x10) +#define _COMM_PAGE_CPU_CAPABILITIES64 (_COMM_PAGE_START_ADDRESS+0x010) /* uint64_t _cpu_capabilities */ +#define _COMM_PAGE_UNUSED (_COMM_PAGE_START_ADDRESS+0x018) /* 6 unused bytes */ #define _COMM_PAGE_VERSION (_COMM_PAGE_START_ADDRESS+0x01E) /* 16-bit version# */ -#define _COMM_PAGE_THIS_VERSION 12 /* version of the commarea format */ +#define _COMM_PAGE_THIS_VERSION 13 /* in ver 13, _COMM_PAGE_NT_SHIFT defaults to 0 (was 32) */ -#define _COMM_PAGE_CPU_CAPABILITIES (_COMM_PAGE_START_ADDRESS+0x020) /* uint32_t _cpu_capabilities */ +#define _COMM_PAGE_CPU_CAPABILITIES (_COMM_PAGE_START_ADDRESS+0x020) /* uint32_t _cpu_capabilities (retained for compatibility) */ #define _COMM_PAGE_NCPUS (_COMM_PAGE_START_ADDRESS+0x022) /* uint8_t number of configured CPUs (hw.logicalcpu at boot time) */ -#define _COMM_PAGE_UNUSED0 (_COMM_PAGE_START_ADDRESS+0x024) /* 2 unused bytes, reserved for future expansion of cpu_capabilities */ +#define _COMM_PAGE_UNUSED0 (_COMM_PAGE_START_ADDRESS+0x024) /* 2 unused bytes, previouly reserved for expansion of cpu_capabilities */ #define _COMM_PAGE_CACHE_LINESIZE (_COMM_PAGE_START_ADDRESS+0x026) /* uint16_t cache line size */ #define _COMM_PAGE_SCHED_GEN (_COMM_PAGE_START_ADDRESS+0x028) /* uint32_t scheduler generation number (count of pre-emptions) */ diff --git a/osfmk/i386/cpu_data.h b/osfmk/i386/cpu_data.h index a501be695..620ba6a29 100644 --- a/osfmk/i386/cpu_data.h +++ b/osfmk/i386/cpu_data.h @@ -125,6 +125,10 @@ typedef struct { typedef uint16_t pcid_t; typedef uint8_t pcid_ref_t; + +#define CPU_RTIME_BINS (12) +#define CPU_ITIME_BINS (CPU_RTIME_BINS) + /* * Per-cpu data. 
* @@ -157,12 +161,9 @@ typedef struct cpu_data int cpu_prior_signals; /* Last set of events, * debugging */ - int cpu_mcount_off; /* mcount recursion */ ast_t cpu_pending_ast; - int cpu_type; - int cpu_subtype; - int cpu_threadtype; - int cpu_running; + volatile int cpu_running; + boolean_t cpu_fixed_pmcs_enabled; rtclock_timer_t rtclock_timer; boolean_t cpu_is64bit; volatile addr64_t cpu_active_cr3 __attribute((aligned(64))); @@ -188,9 +189,6 @@ typedef struct cpu_data struct fake_descriptor *cpu_ldtp; cpu_desc_index_t cpu_desc_index; int cpu_ldt; - boolean_t cpu_iflag; - boolean_t cpu_boot_complete; - int cpu_hibernate; #if NCOPY_WINDOWS > 0 vm_offset_t cpu_copywindow_base; uint64_t *cpu_copywindow_pdp; @@ -198,18 +196,13 @@ typedef struct cpu_data vm_offset_t cpu_physwindow_base; uint64_t *cpu_physwindow_ptep; #endif - void *cpu_hi_iss; #define HWINTCNT_SIZE 256 uint32_t cpu_hwIntCnt[HWINTCNT_SIZE]; /* Interrupt counts */ + uint64_t cpu_hwIntpexits[HWINTCNT_SIZE]; + uint64_t cpu_hwIntcexits[HWINTCNT_SIZE]; uint64_t cpu_dr7; /* debug control register */ uint64_t cpu_int_event_time; /* intr entry/exit time */ -#if CONFIG_VMX - vmx_cpu_t cpu_vmx; /* wonderful world of virtualization */ -#endif -#if CONFIG_MCA - struct mca_state *cpu_mca_state; /* State at MC fault */ -#endif uint64_t cpu_uber_arg_store; /* Double mapped address * of current thread's * uu_arg array. @@ -246,12 +239,17 @@ typedef struct cpu_data uint64_t cpu_c7res; uint64_t cpu_itime_total; uint64_t cpu_rtime_total; - uint64_t cpu_rtimes[4]; - uint64_t cpu_itimes[4]; uint64_t cpu_ixtime; + uint64_t cpu_idle_exits; + uint64_t cpu_rtimes[CPU_RTIME_BINS]; + uint64_t cpu_itimes[CPU_ITIME_BINS]; + uint64_t cpu_cur_insns; + uint64_t cpu_cur_ucc; + uint64_t cpu_cur_urc; uint64_t cpu_max_observed_int_latency; int cpu_max_observed_int_latency_vector; uint64_t debugger_entry_time; + uint64_t debugger_ipi_time; volatile boolean_t cpu_NMI_acknowledged; /* A separate nested interrupt stack flag, to account * for non-nested interrupts arriving while on the interrupt stack @@ -262,6 +260,18 @@ typedef struct cpu_data uint32_t cpu_nested_istack_events; x86_saved_state64_t *cpu_fatal_trap_state; x86_saved_state64_t *cpu_post_fatal_trap_state; +#if CONFIG_VMX + vmx_cpu_t cpu_vmx; /* wonderful world of virtualization */ +#endif +#if CONFIG_MCA + struct mca_state *cpu_mca_state; /* State at MC fault */ +#endif + int cpu_type; + int cpu_subtype; + int cpu_threadtype; + boolean_t cpu_iflag; + boolean_t cpu_boot_complete; + int cpu_hibernate; } cpu_data_t; extern cpu_data_t *cpu_data_ptr[]; diff --git a/osfmk/i386/cpu_topology.h b/osfmk/i386/cpu_topology.h index ff109f927..715a25420 100644 --- a/osfmk/i386/cpu_topology.h +++ b/osfmk/i386/cpu_topology.h @@ -207,6 +207,7 @@ typedef struct x86_pkg void *pmStats; /* Power Management stats for package*/ void *pmState; /* Power Management state for package*/ struct mca_state *mca_state; /* MCA state for memory errors */ + uint64_t package_idle_exits; uint32_t num_idle; } x86_pkg_t; diff --git a/osfmk/i386/cpuid.c b/osfmk/i386/cpuid.c index 46061d43a..3ca38be8b 100644 --- a/osfmk/i386/cpuid.c +++ b/osfmk/i386/cpuid.c @@ -162,6 +162,7 @@ static cpuid_cache_descriptor_t intel_cpuid_leaf2_descriptor_table[] = { { 0x70, CACHE, TRACE, 8, 12*K, NA }, { 0x71, CACHE, TRACE, 8, 16*K, NA }, { 0x72, CACHE, TRACE, 8, 32*K, NA }, + { 0x76, TLB, INST, NA, BOTH, 8 }, { 0x78, CACHE, L2, 4, 1*M, 64 }, { 0x79, CACHE, L2_2LINESECTOR, 8, 128*K, 64 }, { 0x7A, CACHE, L2_2LINESECTOR, 8, 256*K, 64 }, @@ -181,8 +182,11 @@ static 
cpuid_cache_descriptor_t intel_cpuid_leaf2_descriptor_table[] = { { 0xB2, TLB, INST, 4, SMALL, 64 }, { 0xB3, TLB, DATA, 4, SMALL, 128 }, { 0xB4, TLB, DATA1, 4, SMALL, 256 }, + { 0xB5, TLB, DATA1, 8, SMALL, 64 }, + { 0xB6, TLB, DATA1, 8, SMALL, 128 }, { 0xBA, TLB, DATA1, 4, BOTH, 64 }, - { 0xCA, STLB, DATA1, 4, BOTH, 512 }, + { 0xC1, STLB, DATA1, 8, SMALL, 1024}, + { 0xCA, STLB, DATA1, 4, SMALL, 512 }, { 0xD0, CACHE, L3, 4, 512*K, 64 }, { 0xD1, CACHE, L3, 4, 1*M, 64 }, { 0xD2, CACHE, L3, 4, 2*M, 64 }, @@ -663,13 +667,13 @@ cpuid_set_generic_info(i386_cpu_info_t *info_p) ctp->sensor = bitfield32(reg[eax], 0, 0); ctp->dynamic_acceleration = bitfield32(reg[eax], 1, 1); ctp->invariant_APIC_timer = bitfield32(reg[eax], 2, 2); - ctp->core_power_limits = bitfield32(reg[eax], 3, 3); - ctp->fine_grain_clock_mod = bitfield32(reg[eax], 4, 4); - ctp->package_thermal_intr = bitfield32(reg[eax], 5, 5); + ctp->core_power_limits = bitfield32(reg[eax], 4, 4); + ctp->fine_grain_clock_mod = bitfield32(reg[eax], 5, 5); + ctp->package_thermal_intr = bitfield32(reg[eax], 6, 6); ctp->thresholds = bitfield32(reg[ebx], 3, 0); ctp->ACNT_MCNT = bitfield32(reg[ecx], 0, 0); ctp->hardware_feedback = bitfield32(reg[ecx], 1, 1); - ctp->energy_policy = bitfield32(reg[ecx], 2, 2); + ctp->energy_policy = bitfield32(reg[ecx], 3, 3); info_p->cpuid_thermal_leafp = ctp; DBG(" Thermal/Power Leaf:\n"); @@ -681,7 +685,7 @@ cpuid_set_generic_info(i386_cpu_info_t *info_p) DBG(" package_thermal_intr : %d\n", ctp->package_thermal_intr); DBG(" thresholds : %d\n", ctp->thresholds); DBG(" ACNT_MCNT : %d\n", ctp->ACNT_MCNT); - DBG(" hardware_feedback : %d\n", ctp->hardware_feedback); + DBG(" ACNT2 : %d\n", ctp->hardware_feedback); DBG(" energy_policy : %d\n", ctp->energy_policy); } @@ -726,9 +730,9 @@ cpuid_set_generic_info(i386_cpu_info_t *info_p) DBG(" EDX : 0x%x\n", xsp->extended_state[edx]); } - if (info_p->cpuid_model == CPUID_MODEL_IVYBRIDGE) { + if (info_p->cpuid_model >= CPUID_MODEL_IVYBRIDGE) { /* - * XSAVE Features: + * Leaf7 Features: */ cpuid_fn(0x7, reg); info_p->cpuid_leaf7_features = reg[ebx]; @@ -777,6 +781,11 @@ cpuid_set_cpufamily(i386_cpu_info_t *info_p) case CPUID_MODEL_IVYBRIDGE: cpufamily = CPUFAMILY_INTEL_IVYBRIDGE; break; + case CPUID_MODEL_HASWELL: + case CPUID_MODEL_HASWELL_ULT: + case CPUID_MODEL_CRYSTALWELL: + cpufamily = CPUFAMILY_INTEL_HASWELL; + break; } break; } @@ -823,6 +832,7 @@ cpuid_set_info(void) info_p->thread_count = bitfield32((uint32_t)msr, 15, 0); break; } + case CPUFAMILY_INTEL_HASWELL: case CPUFAMILY_INTEL_IVYBRIDGE: case CPUFAMILY_INTEL_SANDYBRIDGE: case CPUFAMILY_INTEL_NEHALEM: { @@ -887,12 +897,13 @@ static struct table { {CPUID_FEATURE_TM2, "TM2"}, {CPUID_FEATURE_SSSE3, "SSSE3"}, {CPUID_FEATURE_CID, "CID"}, + {CPUID_FEATURE_FMA, "FMA"}, {CPUID_FEATURE_CX16, "CX16"}, {CPUID_FEATURE_xTPR, "TPR"}, {CPUID_FEATURE_PDCM, "PDCM"}, {CPUID_FEATURE_SSE4_1, "SSE4.1"}, {CPUID_FEATURE_SSE4_2, "SSE4.2"}, - {CPUID_FEATURE_xAPIC, "xAPIC"}, + {CPUID_FEATURE_x2APIC, "x2APIC"}, {CPUID_FEATURE_MOVBE, "MOVBE"}, {CPUID_FEATURE_POPCNT, "POPCNT"}, {CPUID_FEATURE_AES, "AES"}, @@ -920,8 +931,15 @@ extfeature_map[] = { }, leaf7_feature_map[] = { {CPUID_LEAF7_FEATURE_RDWRFSGS, "RDWRFSGS"}, + {CPUID_LEAF7_FEATURE_TSCOFF, "TSC_THREAD_OFFSET"}, + {CPUID_LEAF7_FEATURE_BMI1, "BMI1"}, + {CPUID_LEAF7_FEATURE_HLE, "HLE"}, {CPUID_LEAF7_FEATURE_SMEP, "SMEP"}, + {CPUID_LEAF7_FEATURE_AVX2, "AVX2"}, + {CPUID_LEAF7_FEATURE_BMI2, "BMI2"}, {CPUID_LEAF7_FEATURE_ENFSTRG, "ENFSTRG"}, + {CPUID_LEAF7_FEATURE_INVPCID, "INVPCID"}, + 
{CPUID_LEAF7_FEATURE_RTM, "RTM"}, {0, 0} }; diff --git a/osfmk/i386/cpuid.h b/osfmk/i386/cpuid.h index 1bc3e2927..7597bc653 100644 --- a/osfmk/i386/cpuid.h +++ b/osfmk/i386/cpuid.h @@ -97,6 +97,7 @@ #define CPUID_FEATURE_SSSE3 _HBit(9) /* Supplemental SSE3 instructions */ #define CPUID_FEATURE_CID _HBit(10) /* L1 Context ID */ #define CPUID_FEATURE_SEGLIM64 _HBit(11) /* 64-bit segment limit checking */ +#define CPUID_FEATURE_FMA _HBit(12) /* Fused-Multiply-Add support */ #define CPUID_FEATURE_CX16 _HBit(13) /* CmpXchg16b instruction */ #define CPUID_FEATURE_xTPR _HBit(14) /* Send Task PRiority msgs */ #define CPUID_FEATURE_PDCM _HBit(15) /* Perf/Debug Capability MSR */ @@ -105,7 +106,7 @@ #define CPUID_FEATURE_DCA _HBit(18) /* Direct Cache Access */ #define CPUID_FEATURE_SSE4_1 _HBit(19) /* Streaming SIMD extensions 4.1 */ #define CPUID_FEATURE_SSE4_2 _HBit(20) /* Streaming SIMD extensions 4.2 */ -#define CPUID_FEATURE_xAPIC _HBit(21) /* Extended APIC Mode */ +#define CPUID_FEATURE_x2APIC _HBit(21) /* Extended APIC Mode */ #define CPUID_FEATURE_MOVBE _HBit(22) /* MOVBE instruction */ #define CPUID_FEATURE_POPCNT _HBit(23) /* POPCNT instruction */ #define CPUID_FEATURE_TSCTMR _HBit(24) /* TSC deadline timer */ @@ -126,8 +127,15 @@ * Bits returned in %ebx to a CPUID request with {%eax,%ecx} of (0x7,0x0}: */ #define CPUID_LEAF7_FEATURE_RDWRFSGS _Bit(0) /* FS/GS base read/write */ +#define CPUID_LEAF7_FEATURE_TSCOFF _Bit(1) /* TSC thread offset */ +#define CPUID_LEAF7_FEATURE_BMI1 _Bit(3) /* Bit Manipulation Instrs, set 1 */ +#define CPUID_LEAF7_FEATURE_HLE _Bit(4) /* Hardware Lock Elision*/ +#define CPUID_LEAF7_FEATURE_AVX2 _Bit(5) /* AVX2 Instructions */ #define CPUID_LEAF7_FEATURE_SMEP _Bit(7) /* Supervisor Mode Execute Protect */ +#define CPUID_LEAF7_FEATURE_BMI2 _Bit(8) /* Bit Manipulation Instrs, set 2 */ #define CPUID_LEAF7_FEATURE_ENFSTRG _Bit(9) /* ENhanced Fast STRinG copy */ +#define CPUID_LEAF7_FEATURE_INVPCID _Bit(10) /* INVPCID intruction, TDB */ +#define CPUID_LEAF7_FEATURE_RTM _Bit(11) /* TBD */ /* * The CPUID_EXTFEATURE_XXX values define 64-bit values @@ -166,6 +174,10 @@ #define CPUID_MODEL_SANDYBRIDGE 0x2A #define CPUID_MODEL_JAKETOWN 0x2D #define CPUID_MODEL_IVYBRIDGE 0x3A +#define CPUID_MODEL_HASWELL 0x3C +#define CPUID_MODEL_HASWELL_SVR 0x3F +#define CPUID_MODEL_HASWELL_ULT 0x45 +#define CPUID_MODEL_CRYSTALWELL 0x46 #define CPUID_VMM_FAMILY_UNKNOWN 0x0 diff --git a/osfmk/i386/etimer.c b/osfmk/i386/etimer.c index c834962ef..3e03db1e9 100644 --- a/osfmk/i386/etimer.c +++ b/osfmk/i386/etimer.c @@ -150,7 +150,7 @@ void etimer_set_deadline(uint64_t deadline) void etimer_resync_deadlines(void) { - uint64_t deadline; + uint64_t deadline = EndOfAllTime; uint64_t pmdeadline; rtclock_timer_t *mytimer; spl_t s = splclock(); @@ -158,7 +158,9 @@ etimer_resync_deadlines(void) uint32_t decr; pp = current_cpu_datap(); - deadline = EndOfAllTime; + if (!pp->cpu_running) + /* There's really nothing to do if this procesor is down */ + return; /* * If we have a clock timer set, pick that. 
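As a point of reference for the leaf-7 bits and Haswell model numbers added above, the same features can be probed from user space with the compiler's cpuid helpers. This is a hedged, self-contained sketch (GCC/Clang <cpuid.h>), not part of the patch; the bit positions mirror the CPUID_LEAF7_FEATURE_* definitions.

#include <cpuid.h>
#include <stdio.h>

/* Bit positions in EBX of CPUID leaf 7, subleaf 0, matching the defines above. */
#define LEAF7_BMI1  (1u << 3)
#define LEAF7_HLE   (1u << 4)
#define LEAF7_AVX2  (1u << 5)
#define LEAF7_BMI2  (1u << 8)
#define LEAF7_RTM   (1u << 11)

int main(void)
{
    unsigned eax, ebx, ecx, edx;

    if (__get_cpuid_max(0, 0) < 7) {          /* leaf 7 not implemented on this CPU */
        printf("CPUID leaf 7 not supported\n");
        return 1;
    }
    __cpuid_count(7, 0, eax, ebx, ecx, edx);
    printf("BMI1:%d HLE:%d AVX2:%d BMI2:%d RTM:%d\n",
           !!(ebx & LEAF7_BMI1), !!(ebx & LEAF7_HLE),
           !!(ebx & LEAF7_AVX2), !!(ebx & LEAF7_BMI2),
           !!(ebx & LEAF7_RTM));
    return 0;
}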
diff --git a/osfmk/i386/fpu.c b/osfmk/i386/fpu.c index 1119a0a73..879851b8a 100644 --- a/osfmk/i386/fpu.c +++ b/osfmk/i386/fpu.c @@ -488,6 +488,10 @@ fpu_set_fxstate( if (fp_kind == FP_NO) return KERN_FAILURE; + if ((f == x86_AVX_STATE32 || f == x86_AVX_STATE64) && + !ml_fpu_avx_enabled()) + return KERN_FAILURE; + state = (x86_float_state64_t *)tstate; assert(thr_act != THREAD_NULL); @@ -607,6 +611,10 @@ fpu_get_fxstate( if (fp_kind == FP_NO) return KERN_FAILURE; + if ((f == x86_AVX_STATE32 || f == x86_AVX_STATE64) && + !ml_fpu_avx_enabled()) + return KERN_FAILURE; + state = (x86_float_state64_t *)tstate; assert(thr_act != THREAD_NULL); diff --git a/osfmk/i386/genassym.c b/osfmk/i386/genassym.c index 0c7f1f595..8595b8bb0 100644 --- a/osfmk/i386/genassym.c +++ b/osfmk/i386/genassym.c @@ -419,8 +419,6 @@ main( offsetof(cpu_data_t *,cpu_number)); DECLARE("CPU_RUNNING", offsetof(cpu_data_t *,cpu_running)); - DECLARE("CPU_MCOUNT_OFF", - offsetof(cpu_data_t *,cpu_mcount_off)); DECLARE("CPU_PENDING_AST", offsetof(cpu_data_t *,cpu_pending_ast)); DECLARE("CPU_DESC_TABLEP", diff --git a/osfmk/i386/hibernate_restore.c b/osfmk/i386/hibernate_restore.c index f04a56c4a..ba8704298 100644 --- a/osfmk/i386/hibernate_restore.c +++ b/osfmk/i386/hibernate_restore.c @@ -82,6 +82,7 @@ pal_hib_map(uintptr_t virt, uint64_t phys) index = (virt >> I386_LPGSHIFT); virt += (uintptr_t)(phys & I386_LPGMASK); phys = ((phys & ~((uint64_t)I386_LPGMASK)) | INTEL_PTE_PS | INTEL_PTE_VALID | INTEL_PTE_WRITE); + if (phys == BootPTD[index]) return (virt); BootPTD[index] = phys; invlpg(virt); BootPTD[index + 1] = (phys + I386_LPGBYTES); diff --git a/osfmk/i386/i386_init.c b/osfmk/i386/i386_init.c index 39102c926..910194f59 100644 --- a/osfmk/i386/i386_init.c +++ b/osfmk/i386/i386_init.c @@ -578,7 +578,6 @@ i386_init(void) tsc_init(); power_management_init(); - processor_bootstrap(); thread_bootstrap(); @@ -608,6 +607,7 @@ do_init_slave(boolean_t fast_restart) mca_cpu_init(); #endif + LAPIC_INIT(); lapic_configure(); LAPIC_DUMP(); LAPIC_CPU_MAP_DUMP(); @@ -617,12 +617,11 @@ do_init_slave(boolean_t fast_restart) #if CONFIG_MTRR mtrr_update_cpu(); #endif + /* update CPU microcode */ + ucode_update_wake(); } else init_param = FAST_SLAVE_INIT; - /* update CPU microcode */ - ucode_update_wake(); - #if CONFIG_VMX /* resume VT operation */ vmx_resume(); diff --git a/osfmk/i386/lapic.c b/osfmk/i386/lapic.c index b365d6070..0e74dd06e 100644 --- a/osfmk/i386/lapic.c +++ b/osfmk/i386/lapic.c @@ -48,7 +48,6 @@ /* Base vector for local APIC interrupt sources */ int lapic_interrupt_base = LAPIC_DEFAULT_INTERRUPT_BASE; -#define MAX_LAPICIDS (LAPIC_ID_MAX+1) int lapic_to_cpu[MAX_LAPICIDS]; int cpu_to_lapic[MAX_CPUS]; diff --git a/osfmk/i386/lapic.h b/osfmk/i386/lapic.h index 378f4d7eb..219332ae4 100644 --- a/osfmk/i386/lapic.h +++ b/osfmk/i386/lapic.h @@ -174,15 +174,20 @@ typedef enum { #define LAPIC_MSR(reg) (LAPIC_MSR_BASE + LAPIC_MSR_OFFSET(reg)) typedef struct { - void (*init) (void); - uint32_t (*read) (lapic_register_t); - void (*write)(lapic_register_t, uint32_t); + void (*init) (void); + uint32_t (*read) (lapic_register_t); + void (*write) (lapic_register_t, uint32_t); + uint64_t (*read_icr) (void); + void (*write_icr) (uint32_t, uint32_t); } lapic_ops_table_t; extern lapic_ops_table_t *lapic_ops; +#define LAPIC_INIT() lapic_ops->init(); #define LAPIC_WRITE(reg,val) lapic_ops->write(reg, val) #define LAPIC_READ(reg) lapic_ops->read(reg) #define LAPIC_READ_OFFSET(reg,off) LAPIC_READ((reg)+(off)) +#define LAPIC_READ_ICR() 
lapic_ops->read_icr() +#define LAPIC_WRITE_ICR(dst,cmd) lapic_ops->write_icr(dst, cmd) typedef enum { periodic, @@ -225,6 +230,7 @@ typedef uint32_t lapic_timer_count_t; #define LAPIC_PM_INTERRUPT 0x7 #define LAPIC_PMC_SWI_VECTOR (LAPIC_DEFAULT_INTERRUPT_BASE + LAPIC_PMC_SW_INTERRUPT) +#define LAPIC_TIMER_VECTOR (LAPIC_DEFAULT_INTERRUPT_BASE + LAPIC_TIMER_INTERRUPT) /* The vector field is ignored for NMI interrupts via the LAPIC * or otherwise, so this is not an offset from the interrupt @@ -314,8 +320,8 @@ extern boolean_t lapic_is_interrupting(uint8_t vector); extern void lapic_interrupt_counts(uint64_t intrs[256]); extern void lapic_disable_timer(void); +#define MAX_LAPICIDS (LAPIC_ID_MAX+1) #ifdef MP_DEBUG -extern void lapic_cpu_map_dump(void); #define LAPIC_CPU_MAP_DUMP() lapic_cpu_map_dump() #define LAPIC_DUMP() lapic_dump() #else diff --git a/osfmk/i386/lapic_native.c b/osfmk/i386/lapic_native.c index 3e6991974..347b9e969 100644 --- a/osfmk/i386/lapic_native.c +++ b/osfmk/i386/lapic_native.c @@ -116,33 +116,42 @@ legacy_init(void) vm_map_offset_t lapic_vbase64; /* Establish a map to the local apic */ - lapic_vbase64 = (vm_offset_t)vm_map_min(kernel_map); - result = vm_map_find_space(kernel_map, - &lapic_vbase64, - round_page(LAPIC_SIZE), 0, - VM_MAKE_TAG(VM_MEMORY_IOKIT), &entry); - /* Convert 64-bit vm_map_offset_t to "pointer sized" vm_offset_t - */ - lapic_vbase = (vm_offset_t) lapic_vbase64; - if (result != KERN_SUCCESS) { - panic("legacy_init: vm_map_find_entry FAILED (err=%d)", result); + if (lapic_vbase == 0) { + lapic_vbase64 = (vm_offset_t)vm_map_min(kernel_map); + result = vm_map_find_space(kernel_map, + &lapic_vbase64, + round_page(LAPIC_SIZE), 0, + VM_MAKE_TAG(VM_MEMORY_IOKIT), &entry); + /* Convert 64-bit vm_map_offset_t to "pointer sized" vm_offset_t + */ + lapic_vbase = (vm_offset_t) lapic_vbase64; + if (result != KERN_SUCCESS) { + panic("legacy_init: vm_map_find_entry FAILED (err=%d)", result); + } + vm_map_unlock(kernel_map); + + /* + * Map in the local APIC non-cacheable, as recommended by Intel + * in section 8.4.1 of the "System Programming Guide". + * In fact, this is redundant because EFI will have assigned an + * MTRR physical range containing the local APIC's MMIO space as + * UC and this will override the default PAT setting. + */ + pmap_enter(pmap_kernel(), + lapic_vbase, + (ppnum_t) i386_btop(lapic_pbase), + VM_PROT_READ|VM_PROT_WRITE, + VM_PROT_NONE, + VM_WIMG_IO, + TRUE); } - vm_map_unlock(kernel_map); /* - * Map in the local APIC non-cacheable, as recommended by Intel - * in section 8.4.1 of the "System Programming Guide". - * In fact, this is redundant because EFI will have assigned an - * MTRR physical range containing the local APIC's MMIO space as - * UC and this will override the default PAT setting. + * Set flat delivery model, logical processor id + * This should already be the default set. 
*/ - pmap_enter(pmap_kernel(), - lapic_vbase, - (ppnum_t) i386_btop(lapic_pbase), - VM_PROT_READ|VM_PROT_WRITE, - VM_PROT_NONE, - VM_WIMG_IO, - TRUE); + LAPIC_WRITE(DFR, LAPIC_DFR_FLAT); + LAPIC_WRITE(LDR, (get_cpu_number()) << LAPIC_LDR_SHIFT); } @@ -158,15 +167,41 @@ legacy_write(lapic_register_t reg, uint32_t value) *LAPIC_MMIO(reg) = value; } +static uint64_t +legacy_read_icr(void) +{ + return (((uint64_t)*LAPIC_MMIO(ICRD)) << 32) | ((uint64_t)*LAPIC_MMIO(ICR)); +} + +static void +legacy_write_icr(uint32_t dst, uint32_t cmd) +{ + *LAPIC_MMIO(ICRD) = dst << LAPIC_ICRD_DEST_SHIFT; + *LAPIC_MMIO(ICR) = cmd; +} + static lapic_ops_table_t legacy_ops = { legacy_init, legacy_read, - legacy_write + legacy_write, + legacy_read_icr, + legacy_write_icr }; +static boolean_t is_x2apic = FALSE; + static void x2apic_init(void) { + uint32_t lo; + uint32_t hi; + + rdmsr(MSR_IA32_APIC_BASE, lo, hi); + if ((lo & MSR_IA32_APIC_BASE_EXTENDED) == 0) { + lo |= MSR_IA32_APIC_BASE_EXTENDED; + wrmsr(MSR_IA32_APIC_BASE, lo, hi); + kprintf("x2APIC mode enabled\n"); + } } static uint32_t @@ -185,13 +220,26 @@ x2apic_write(lapic_register_t reg, uint32_t value) wrmsr(LAPIC_MSR(reg), value, 0); } +static uint64_t +x2apic_read_icr(void) +{ + return rdmsr64(LAPIC_MSR(ICR));; +} + +static void +x2apic_write_icr(uint32_t dst, uint32_t cmd) +{ + wrmsr(LAPIC_MSR(ICR), cmd, dst); +} + static lapic_ops_table_t x2apic_ops = { x2apic_init, x2apic_read, - x2apic_write + x2apic_write, + x2apic_read_icr, + x2apic_write_icr }; - void lapic_init(void) { @@ -199,7 +247,6 @@ lapic_init(void) uint32_t hi; boolean_t is_boot_processor; boolean_t is_lapic_enabled; - boolean_t is_x2apic; /* Examine the local APIC state */ rdmsr(MSR_IA32_APIC_BASE, lo, hi); @@ -214,10 +261,21 @@ lapic_init(void) if (!is_boot_processor || !is_lapic_enabled) panic("Unexpected local APIC state\n"); + /* + * If x2APIC is available and not already enabled, enable it. + * Unless overriden by boot-arg. + */ + if (!is_x2apic && (cpuid_features() & CPUID_FEATURE_x2APIC)) { + PE_parse_boot_argn("-x2apic", &is_x2apic, sizeof(is_x2apic)); + kprintf("x2APIC supported %s be enabled\n", + is_x2apic ? "and will" : "but will not"); + } + lapic_ops = is_x2apic ? &x2apic_ops : &legacy_ops; - lapic_ops->init(); + LAPIC_INIT(); + kprintf("ID: 0x%x LDR: 0x%x\n", LAPIC_READ(ID), LAPIC_READ(LDR)); if ((LAPIC_READ(VERSION)&LAPIC_VERSION_MASK) < 0x14) { panic("Local APIC version 0x%x, 0x14 or more expected\n", (LAPIC_READ(VERSION)&LAPIC_VERSION_MASK)); @@ -290,7 +348,7 @@ lapic_dump(void) LAPIC_READ(APR)&LAPIC_APR_MASK, LAPIC_READ(PPR)&LAPIC_PPR_MASK); kprintf("Destination Format 0x%x Logical Destination 0x%x\n", - LAPIC_READ(DFR)>>LAPIC_DFR_SHIFT, + is_x2apic ? 
0 : LAPIC_READ(DFR)>>LAPIC_DFR_SHIFT, LAPIC_READ(LDR)>>LAPIC_LDR_SHIFT); kprintf("%cEnabled %cFocusChecking SV 0x%x\n", BOOL(LAPIC_READ(SVR)&LAPIC_SVR_ENABLE), @@ -449,10 +507,6 @@ lapic_configure(void) } } - /* Set flat delivery model, logical processor id */ - LAPIC_WRITE(DFR, LAPIC_DFR_FLAT); - LAPIC_WRITE(LDR, (get_cpu_number()) << LAPIC_LDR_SHIFT); - /* Accept all */ LAPIC_WRITE(TPR, 0); @@ -801,12 +855,11 @@ lapic_send_ipi(int cpu, int vector) state = ml_set_interrupts_enabled(FALSE); /* Wait for pending outgoing send to complete */ - while (LAPIC_READ(ICR) & LAPIC_ICR_DS_PENDING) { + while (LAPIC_READ_ICR() & LAPIC_ICR_DS_PENDING) { cpu_pause(); } - LAPIC_WRITE(ICRD, cpu_to_lapic[cpu] << LAPIC_ICRD_DEST_SHIFT); - LAPIC_WRITE(ICR, vector | LAPIC_ICR_DM_FIXED); + LAPIC_WRITE_ICR(cpu_to_lapic[cpu], vector | LAPIC_ICR_DM_FIXED); (void) ml_set_interrupts_enabled(state); } diff --git a/osfmk/i386/machine_check.c b/osfmk/i386/machine_check.c index 64b0c9b8d..c70168b3c 100644 --- a/osfmk/i386/machine_check.c +++ b/osfmk/i386/machine_check.c @@ -89,12 +89,19 @@ static void mca_get_availability(void) { uint64_t features = cpuid_info()->cpuid_features; - uint32_t family = cpuid_info()->cpuid_family; + uint32_t family = cpuid_info()->cpuid_family; + uint32_t model = cpuid_info()->cpuid_model; + uint32_t stepping = cpuid_info()->cpuid_stepping; mca_MCE_present = (features & CPUID_FEATURE_MCE) != 0; mca_MCA_present = (features & CPUID_FEATURE_MCA) != 0; mca_family = family; - + + if ((model == CPUID_MODEL_HASWELL && stepping < 3) || + (model == CPUID_MODEL_HASWELL_ULT && stepping < 1) || + (model == CPUID_MODEL_CRYSTALWELL && stepping < 1)) + panic("Haswell pre-C0 steppings are not supported"); + /* * If MCA, the number of banks etc is reported by the IA32_MCG_CAP MSR. */ diff --git a/osfmk/i386/machine_routines.c b/osfmk/i386/machine_routines.c index e008a9a61..721806047 100644 --- a/osfmk/i386/machine_routines.c +++ b/osfmk/i386/machine_routines.c @@ -567,9 +567,9 @@ ml_init_lock_timeout(void) * instead of spinning for clock_delay_until(). */ void -ml_init_delay_spin_threshold(void) +ml_init_delay_spin_threshold(int threshold_us) { - nanoseconds_to_absolutetime(10ULL * NSEC_PER_USEC, &delay_spin_threshold); + nanoseconds_to_absolutetime(threshold_us * NSEC_PER_USEC, &delay_spin_threshold); } boolean_t @@ -579,7 +579,7 @@ ml_delay_should_spin(uint64_t interval) } /* - * This is called from the machine-independent routine cpu_up() + * This is called from the machine-independent layer * to perform machine-dependent info updates. Defer to cpu_thread_init(). */ void @@ -589,12 +589,14 @@ ml_cpu_up(void) } /* - * This is called from the machine-independent routine cpu_down() + * This is called from the machine-independent layer * to perform machine-dependent info updates. 
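The reason the LAPIC ops table grows read_icr/write_icr entries (used by lapic_send_ipi above) is that the two register models differ in write semantics: legacy xAPIC programs the ICR with two 32-bit MMIO stores, destination first, and the store to the low word is what fires the IPI, whereas x2APIC exposes the ICR as one 64-bit MSR (0x830) with the destination in the upper half, so a single wrmsr both targets and sends. A compact restatement of the two paths from the hunk above, with placeholder MMIO pointers, purely for comparison:

#include <stdint.h>

#define LAPIC_ICRD_DEST_SHIFT 24                 /* xAPIC destination field position */

static volatile uint32_t *lapic_mmio_icr;        /* placeholder for the mapped xAPIC ICR   */
static volatile uint32_t *lapic_mmio_icrd;       /* placeholder for the mapped xAPIC ICR-hi */

/* xAPIC: two MMIO stores; the second (low word) triggers the IPI. */
static void xapic_write_icr(uint32_t dst, uint32_t cmd)
{
    *lapic_mmio_icrd = dst << LAPIC_ICRD_DEST_SHIFT;   /* program destination first */
    *lapic_mmio_icr  = cmd;                            /* this store sends the IPI  */
}

/* x2APIC: the ICR is MSR 0x830; EAX carries the command, EDX the destination ID. */
static inline void wrmsr_sketch(uint32_t msr, uint32_t lo, uint32_t hi)
{
    __asm__ volatile("wrmsr" : : "c"(msr), "a"(lo), "d"(hi));
}

static void x2apic_write_icr(uint32_t dst, uint32_t cmd)
{
    wrmsr_sketch(0x830, cmd, dst);                     /* one write targets and sends */
}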
*/ void ml_cpu_down(void) { + i386_deactivate_cpu(); + return; } diff --git a/osfmk/i386/machine_routines.h b/osfmk/i386/machine_routines.h index d800625f7..270ddcfb5 100644 --- a/osfmk/i386/machine_routines.h +++ b/osfmk/i386/machine_routines.h @@ -80,7 +80,7 @@ void ml_install_interrupt_handler( void ml_get_timebase(unsigned long long *timestamp); void ml_init_lock_timeout(void); -void ml_init_delay_spin_threshold(void); +void ml_init_delay_spin_threshold(int); boolean_t ml_delay_should_spin(uint64_t interval); diff --git a/osfmk/i386/misc_protos.h b/osfmk/i386/misc_protos.h index 0007b3321..4a2ed207d 100644 --- a/osfmk/i386/misc_protos.h +++ b/osfmk/i386/misc_protos.h @@ -173,4 +173,5 @@ extern void act_machine_switch_pcb(thread_t old, thread_t new); #define FAST_SLAVE_INIT ((void *)(uintptr_t)1) uint64_t ml_early_random(void); +void cpu_pmc_control(void *); #endif /* _I386_MISC_PROTOS_H_ */ diff --git a/osfmk/i386/mp.c b/osfmk/i386/mp.c index b66399d2d..286b822aa 100644 --- a/osfmk/i386/mp.c +++ b/osfmk/i386/mp.c @@ -93,6 +93,9 @@ #define TRACE_MP_CPUS_CALL_LOCAL MACHDBG_CODE(DBG_MACH_MP, 2) #define TRACE_MP_CPUS_CALL_ACTION MACHDBG_CODE(DBG_MACH_MP, 3) #define TRACE_MP_CPUS_CALL_NOBUF MACHDBG_CODE(DBG_MACH_MP, 4) +#define TRACE_MP_CPU_FAST_START MACHDBG_CODE(DBG_MACH_MP, 5) +#define TRACE_MP_CPU_START MACHDBG_CODE(DBG_MACH_MP, 6) +#define TRACE_MP_CPU_DEACTIVATE MACHDBG_CODE(DBG_MACH_MP, 7) #define ABS(v) (((v) > 0)?(v):-(v)) @@ -287,6 +290,10 @@ intel_startCPU_fast(int slot_num) */ return(rc); + KERNEL_DEBUG_CONSTANT( + TRACE_MP_CPU_FAST_START | DBG_FUNC_START, + slot_num, 0, 0, 0, 0); + /* * Wait until the CPU is back online. */ @@ -301,6 +308,10 @@ intel_startCPU_fast(int slot_num) mp_wait_for_cpu_up(slot_num, 30000, 1); mp_enable_preemption(); + KERNEL_DEBUG_CONSTANT( + TRACE_MP_CPU_FAST_START | DBG_FUNC_END, + slot_num, cpu_datap(slot_num)->cpu_running, 0, 0, 0); + /* * Check to make sure that the CPU is really running. If not, * go through the slow path. @@ -341,13 +352,30 @@ start_cpu(void *arg) if (cpu_number() != psip->starter_cpu) return; + DBG("start_cpu(%p) about to start cpu %d, lapic %d\n", + arg, psip->target_cpu, psip->target_lapic); + + KERNEL_DEBUG_CONSTANT( + TRACE_MP_CPU_START | DBG_FUNC_START, + psip->target_cpu, + psip->target_lapic, 0, 0, 0); + i386_start_cpu(psip->target_lapic, psip->target_cpu); #ifdef POSTCODE_DELAY /* Wait much longer if postcodes are displayed for a delay period. */ i *= 10000; #endif + DBG("start_cpu(%p) about to wait for cpu %d\n", + arg, psip->target_cpu); + mp_wait_for_cpu_up(psip->target_cpu, i*100, 100); + + KERNEL_DEBUG_CONSTANT( + TRACE_MP_CPU_START | DBG_FUNC_END, + psip->target_cpu, + cpu_datap(psip->target_cpu)->cpu_running, 0, 0, 0); + if (TSC_sync_margin && cpu_datap(psip->target_cpu)->cpu_running) { /* @@ -1293,26 +1321,43 @@ i386_deactivate_cpu(void) cpu_data_t *cdp = current_cpu_datap(); assert(!ml_get_interrupts_enabled()); + + KERNEL_DEBUG_CONSTANT( + TRACE_MP_CPU_DEACTIVATE | DBG_FUNC_START, + 0, 0, 0, 0, 0); simple_lock(&x86_topo_lock); cdp->cpu_running = FALSE; simple_unlock(&x86_topo_lock); + /* + * Move all of this cpu's timers to the master/boot cpu, + * and poke it in case there's a sooner deadline for it to schedule. 
+ */ timer_queue_shutdown(&cdp->rtclock_timer.queue); - cdp->rtclock_timer.deadline = EndOfAllTime; mp_cpus_call(cpu_to_cpumask(master_cpu), ASYNC, etimer_timer_expire, NULL); /* - * In case a rendezvous/braodcast/call was initiated to this cpu - * before we cleared cpu_running, we must perform any actions due. + * Open an interrupt window + * and ensure any pending IPI or timer is serviced */ - if (i_bit(MP_RENDEZVOUS, &cdp->cpu_signals)) - mp_rendezvous_action(); - if (i_bit(MP_BROADCAST, &cdp->cpu_signals)) - mp_broadcast_action(); - if (i_bit(MP_CALL, &cdp->cpu_signals)) - mp_cpus_call_action(); - cdp->cpu_signals = 0; /* all clear */ + mp_disable_preemption(); + ml_set_interrupts_enabled(TRUE); + + while (cdp->cpu_signals && x86_lcpu()->rtcDeadline != EndOfAllTime) + cpu_pause(); + /* + * Ensure there's no remaining timer deadline set + * - AICPM may have left one active. + */ + setPop(0); + + ml_set_interrupts_enabled(FALSE); + mp_enable_preemption(); + + KERNEL_DEBUG_CONSTANT( + TRACE_MP_CPU_DEACTIVATE | DBG_FUNC_END, + 0, 0, 0, 0, 0); } int pmsafe_debug = 1; @@ -1424,7 +1469,7 @@ mp_kdp_enter(void) cpu_NMI_interrupt(cpu); } - DBG("mp_kdp_enter() %u processors done %s\n", + DBG("mp_kdp_enter() %d processors done %s\n", (int)mp_kdp_ncpus, (mp_kdp_ncpus == ncpus) ? "OK" : "timed out"); postcode(MP_KDP_ENTER); @@ -1479,7 +1524,7 @@ mp_kdp_wait(boolean_t flush, boolean_t isNMI) DBG("mp_kdp_wait()\n"); /* If an I/O port has been specified as a debugging aid, issue a read */ panic_io_port_read(); - + current_cpu_datap()->debugger_ipi_time = mach_absolute_time(); #if CONFIG_MCA /* If we've trapped due to a machine-check, save MCA registers */ mca_check_save(); @@ -1581,6 +1626,8 @@ slave_machine_init(void *param) clock_init(); cpu_machine_init(); /* Interrupts enabled hereafter */ mp_cpus_call_cpu_init(); + } else { + cpu_machine_init(); /* Interrupts enabled hereafter */ } } diff --git a/osfmk/i386/mp_native.c b/osfmk/i386/mp_native.c index ea3799780..c013a149b 100644 --- a/osfmk/i386/mp_native.c +++ b/osfmk/i386/mp_native.c @@ -68,12 +68,10 @@ i386_smp_init(int nmi_vector, i386_intr_func_t nmi_handler, int ipi_vector, i386 void i386_start_cpu(int lapic_id, __unused int cpu_num ) { - LAPIC_WRITE(ICRD, lapic_id << LAPIC_ICRD_DEST_SHIFT); - LAPIC_WRITE(ICR, LAPIC_ICR_DM_INIT); + LAPIC_WRITE_ICR(lapic_id, LAPIC_ICR_DM_INIT); delay(100); - - LAPIC_WRITE(ICRD, lapic_id << LAPIC_ICRD_DEST_SHIFT); - LAPIC_WRITE(ICR, LAPIC_ICR_DM_STARTUP|(REAL_MODE_BOOTSTRAP_OFFSET>>12)); + LAPIC_WRITE_ICR(lapic_id, + LAPIC_ICR_DM_STARTUP|(REAL_MODE_BOOTSTRAP_OFFSET>>12)); } void @@ -81,11 +79,11 @@ i386_send_NMI(int cpu) { boolean_t state = ml_set_interrupts_enabled(FALSE); /* Program the interrupt command register */ - LAPIC_WRITE(ICRD, cpu_to_lapic[cpu] << LAPIC_ICRD_DEST_SHIFT); /* The vector is ignored in this case--the target CPU will enter on the * NMI vector. 
*/ - LAPIC_WRITE(ICR, LAPIC_VECTOR(INTERPROCESSOR)|LAPIC_ICR_DM_NMI); + LAPIC_WRITE_ICR(cpu_to_lapic[cpu], + LAPIC_VECTOR(INTERPROCESSOR)|LAPIC_ICR_DM_NMI); (void) ml_set_interrupts_enabled(state); } diff --git a/osfmk/i386/pal_native.h b/osfmk/i386/pal_native.h index 13cbf69fb..1979983a8 100644 --- a/osfmk/i386/pal_native.h +++ b/osfmk/i386/pal_native.h @@ -72,10 +72,8 @@ struct pal_rtc_nanotime { volatile uint64_t tsc_base; /* timestamp */ volatile uint64_t ns_base; /* nanoseconds */ uint32_t scale; /* tsc -> nanosec multiplier */ - uint32_t shift; /* tsc -> nanosec shift/div */ - /* shift is overloaded with - * lower 32bits of tsc_freq - * on slower machines (SLOW_TSC_THRESHOLD) */ + uint32_t shift; /* shift is nonzero only on "slow" machines, */ + /* ie where tscFreq <= SLOW_TSC_THRESHOLD */ volatile uint32_t generation; /* 0 == being updated */ uint32_t spare1; }; diff --git a/osfmk/i386/pal_routines.h b/osfmk/i386/pal_routines.h index 7e0112fbb..4945a66c4 100644 --- a/osfmk/i386/pal_routines.h +++ b/osfmk/i386/pal_routines.h @@ -1,4 +1,3 @@ - /* * Copyright (c) 2009 Apple Inc. All rights reserved. * @@ -155,6 +154,7 @@ void pal_efi_hibernate_prepare(void); /* Include a PAL-specific header, too, for xnu-internal overrides */ #include + extern boolean_t virtualized; #define PAL_VIRTUALIZED_PROPERTY_VALUE 4 diff --git a/osfmk/i386/pcb.c b/osfmk/i386/pcb.c index a70348b33..e5274a5eb 100644 --- a/osfmk/i386/pcb.c +++ b/osfmk/i386/pcb.c @@ -991,6 +991,31 @@ machine_thread_set_state( return fpu_set_fxstate(thr_act, tstate, flavor); } + case x86_AVX_STATE: + { + x86_avx_state_t *state; + + if (count != x86_AVX_STATE_COUNT) + return(KERN_INVALID_ARGUMENT); + + state = (x86_avx_state_t *)tstate; + if (state->ash.flavor == x86_AVX_STATE64 && + state->ash.count == x86_FLOAT_STATE64_COUNT && + thread_is_64bit(thr_act)) { + return fpu_set_fxstate(thr_act, + (thread_state_t)&state->ufs.as64, + x86_FLOAT_STATE64); + } + if (state->ash.flavor == x86_FLOAT_STATE32 && + state->ash.count == x86_FLOAT_STATE32_COUNT && + !thread_is_64bit(thr_act)) { + return fpu_set_fxstate(thr_act, + (thread_state_t)&state->ufs.as32, + x86_FLOAT_STATE32); + } + return(KERN_INVALID_ARGUMENT); + } + case x86_THREAD_STATE32: { if (count != x86_THREAD_STATE32_COUNT) @@ -1137,6 +1162,21 @@ machine_thread_get_state( break; } + case THREAD_STATE_FLAVOR_LIST_10_9: + { + if (*count < 5) + return (KERN_INVALID_ARGUMENT); + + tstate[0] = x86_THREAD_STATE; + tstate[1] = x86_FLOAT_STATE; + tstate[2] = x86_EXCEPTION_STATE; + tstate[3] = x86_DEBUG_STATE; + tstate[4] = x86_AVX_STATE; + + *count = 5; + break; + } + case x86_SAVED_STATE32: { x86_saved_state32_t *state; @@ -1245,8 +1285,8 @@ machine_thread_get_state( return(kret); } - case x86_AVX_STATE32: - { + case x86_AVX_STATE32: + { if (*count != x86_AVX_STATE32_COUNT) return(KERN_INVALID_ARGUMENT); @@ -1256,10 +1296,10 @@ machine_thread_get_state( *count = x86_AVX_STATE32_COUNT; return fpu_get_fxstate(thr_act, tstate, flavor); - } + } - case x86_AVX_STATE64: - { + case x86_AVX_STATE64: + { if (*count != x86_AVX_STATE64_COUNT) return(KERN_INVALID_ARGUMENT); @@ -1269,7 +1309,36 @@ machine_thread_get_state( *count = x86_AVX_STATE64_COUNT; return fpu_get_fxstate(thr_act, tstate, flavor); - } + } + + case x86_AVX_STATE: + { + x86_avx_state_t *state; + kern_return_t kret; + + if (*count < x86_AVX_STATE_COUNT) + return(KERN_INVALID_ARGUMENT); + + state = (x86_avx_state_t *)tstate; + + bzero((char *)state, sizeof(x86_avx_state_t)); + if (thread_is_64bit(thr_act)) { + state->ash.flavor = 
x86_AVX_STATE64; + state->ash.count = x86_AVX_STATE64_COUNT; + kret = fpu_get_fxstate(thr_act, + (thread_state_t)&state->ufs.as64, + x86_AVX_STATE64); + } else { + state->ash.flavor = x86_AVX_STATE32; + state->ash.count = x86_AVX_STATE32_COUNT; + kret = fpu_get_fxstate(thr_act, + (thread_state_t)&state->ufs.as32, + x86_AVX_STATE32); + } + *count = x86_AVX_STATE_COUNT; + + return(kret); + } case x86_THREAD_STATE32: { diff --git a/osfmk/i386/pmCPU.c b/osfmk/i386/pmCPU.c index ec5ae7f78..d9c7ff9fb 100644 --- a/osfmk/i386/pmCPU.c +++ b/osfmk/i386/pmCPU.c @@ -49,19 +49,21 @@ #include #include #include - #include extern int disableConsoleOutput; #define DELAY_UNSET 0xFFFFFFFFFFFFFFFFULL +uint64_t cpu_itime_bins[CPU_ITIME_BINS] = {16* NSEC_PER_USEC, 32* NSEC_PER_USEC, 64* NSEC_PER_USEC, 128* NSEC_PER_USEC, 256* NSEC_PER_USEC, 512* NSEC_PER_USEC, 1024* NSEC_PER_USEC, 2048* NSEC_PER_USEC, 4096* NSEC_PER_USEC, 8192* NSEC_PER_USEC, 16384* NSEC_PER_USEC, 32768* NSEC_PER_USEC}; +uint64_t *cpu_rtime_bins = &cpu_itime_bins[0]; + /* * The following is set when the KEXT loads and initializes. */ pmDispatch_t *pmDispatch = NULL; -static uint32_t pmInitDone = 0; +uint32_t pmInitDone = 0; static boolean_t earlyTopology = FALSE; static uint64_t earlyMaxBusDelay = DELAY_UNSET; static uint64_t earlyMaxIntDelay = DELAY_UNSET; @@ -76,13 +78,15 @@ power_management_init(void) (*pmDispatch->cstateInit)(); } -#define CPU_ACTIVE_STAT_BIN_1 (500000) -#define CPU_ACTIVE_STAT_BIN_2 (2000000) -#define CPU_ACTIVE_STAT_BIN_3 (5000000) - -#define CPU_IDLE_STAT_BIN_1 (500000) -#define CPU_IDLE_STAT_BIN_2 (2000000) -#define CPU_IDLE_STAT_BIN_3 (5000000) +static inline void machine_classify_interval(uint64_t interval, uint64_t *bins, uint64_t *binvals, uint32_t nbins) { + uint32_t i; + for (i = 0; i < nbins; i++) { + if (interval < binvals[i]) { + bins[i]++; + break; + } + } +} /* * Called when the CPU is idle. It calls into the power management kext @@ -91,92 +95,77 @@ power_management_init(void) void machine_idle(void) { - cpu_data_t *my_cpu = current_cpu_datap(); - uint64_t ctime, rtime, itime; + cpu_data_t *my_cpu = current_cpu_datap(); + uint64_t ctime, rtime, itime; - if (my_cpu == NULL) - goto out; + if (my_cpu == NULL) + goto out; ctime = mach_absolute_time(); - my_cpu->lcpu.state = LCPU_IDLE; - DBGLOG(cpu_handle, cpu_number(), MP_IDLE); - MARK_CPU_IDLE(cpu_number()); + my_cpu->lcpu.state = LCPU_IDLE; + DBGLOG(cpu_handle, cpu_number(), MP_IDLE); + MARK_CPU_IDLE(cpu_number()); rtime = ctime - my_cpu->cpu_ixtime; my_cpu->cpu_rtime_total += rtime; + machine_classify_interval(rtime, &my_cpu->cpu_rtimes[0], &cpu_rtime_bins[0], CPU_RTIME_BINS); + + if (pmInitDone) { + /* + * Handle case where ml_set_maxbusdelay() or ml_set_maxintdelay() + * were called prior to the CPU PM kext being registered. We do + * this here since we know at this point the values will be first + * used since idle is where the decisions using these values is made. 
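The x86_AVX_STATE handler added to machine_thread_get_state above (the flavor itself is defined in mach/i386/thread_status.h later in this patch) lets a caller request AVX state without first knowing whether the target thread is 32- or 64-bit; the kernel reports which union member it filled via ash.flavor and ash.count. A hedged user-space sketch, assuming the new flavor and x86_avx_state_t are visible in the SDK headers:

#include <mach/mach.h>
#include <mach/thread_status.h>
#include <stdio.h>

static void dump_avx_flavor(thread_act_t thread)
{
    x86_avx_state_t state;
    mach_msg_type_number_t count = x86_AVX_STATE_COUNT;
    kern_return_t kr;

    kr = thread_get_state(thread, x86_AVX_STATE,
                          (thread_state_t)&state, &count);
    if (kr != KERN_SUCCESS) {
        printf("thread_get_state failed: %d\n", kr);
        return;
    }
    /* The kernel tells us which union member is valid. */
    if (state.ash.flavor == x86_AVX_STATE64)
        printf("64-bit AVX state, %u words\n", state.ash.count);
    else
        printf("32-bit AVX state, %u words\n", state.ash.count);
}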
+ */ + if (earlyMaxBusDelay != DELAY_UNSET) + ml_set_maxbusdelay((uint32_t)(earlyMaxBusDelay & 0xFFFFFFFF)); + + if (earlyMaxIntDelay != DELAY_UNSET) + ml_set_maxintdelay(earlyMaxIntDelay); + } - if (rtime < CPU_ACTIVE_STAT_BIN_1) - my_cpu->cpu_rtimes[0]++; - else if (rtime < CPU_ACTIVE_STAT_BIN_2) - my_cpu->cpu_rtimes[1]++; - else if (rtime < CPU_ACTIVE_STAT_BIN_3) - my_cpu->cpu_rtimes[2]++; - else - my_cpu->cpu_rtimes[3]++; - + if (pmInitDone + && pmDispatch != NULL + && pmDispatch->MachineIdle != NULL) + (*pmDispatch->MachineIdle)(0x7FFFFFFFFFFFFFFFULL); + else { + /* + * If no power management, re-enable interrupts and halt. + * This will keep the CPU from spinning through the scheduler + * and will allow at least some minimal power savings (but it + * cause problems in some MP configurations w.r.t. the APIC + * stopping during a GV3 transition). + */ + pal_hlt(); + + /* Once woken, re-disable interrupts. */ + pal_cli(); + } - if (pmInitDone) { /* - * Handle case where ml_set_maxbusdelay() or ml_set_maxintdelay() - * were called prior to the CPU PM kext being registered. We do - * this here since we know at this point the values will be first - * used since idle is where the decisions using these values is made. + * Mark the CPU as running again. */ - if (earlyMaxBusDelay != DELAY_UNSET) - ml_set_maxbusdelay((uint32_t)(earlyMaxBusDelay & 0xFFFFFFFF)); - - if (earlyMaxIntDelay != DELAY_UNSET) - ml_set_maxintdelay(earlyMaxIntDelay); - } - - if (pmInitDone - && pmDispatch != NULL - && pmDispatch->MachineIdle != NULL) - (*pmDispatch->MachineIdle)(0x7FFFFFFFFFFFFFFFULL); - else { - /* - * If no power management, re-enable interrupts and halt. - * This will keep the CPU from spinning through the scheduler - * and will allow at least some minimal power savings (but it - * cause problems in some MP configurations w.r.t. the APIC - * stopping during a GV3 transition). - */ - pal_hlt(); - - /* Once woken, re-disable interrupts. */ - pal_cli(); - } - - /* - * Mark the CPU as running again. - */ - MARK_CPU_ACTIVE(cpu_number()); - DBGLOG(cpu_handle, cpu_number(), MP_UNIDLE); + MARK_CPU_ACTIVE(cpu_number()); + DBGLOG(cpu_handle, cpu_number(), MP_UNIDLE); uint64_t ixtime = my_cpu->cpu_ixtime = mach_absolute_time(); - itime = ixtime - ctime; + my_cpu->cpu_idle_exits++; - my_cpu->lcpu.state = LCPU_RUN; + itime = ixtime - ctime; - if (itime < CPU_IDLE_STAT_BIN_1) - my_cpu->cpu_itimes[0]++; - else if (itime < CPU_IDLE_STAT_BIN_2) - my_cpu->cpu_itimes[1]++; - else if (itime < CPU_IDLE_STAT_BIN_3) - my_cpu->cpu_itimes[2]++; - else - my_cpu->cpu_itimes[3]++; + my_cpu->lcpu.state = LCPU_RUN; + machine_classify_interval(itime, &my_cpu->cpu_itimes[0], &cpu_itime_bins[0], CPU_ITIME_BINS); my_cpu->cpu_itime_total += itime; - /* - * Re-enable interrupts. - */ - out: - pal_sti(); + /* + * Re-enable interrupts. + */ +out: + pal_sti(); } /* @@ -200,6 +189,7 @@ pmCPUHalt(uint32_t reason) break; case PM_HALT_NORMAL: + case PM_HALT_SLEEP: default: pal_cli(); @@ -212,11 +202,14 @@ pmCPUHalt(uint32_t reason) (*pmDispatch->pmCPUHalt)(); /* - * We've exited halt, so get the the CPU schedulable again. + * We've exited halt, so get the CPU schedulable again. + * - by calling the fast init routine for a slave, or + * - by returning if we're the master processor. 
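For orientation, the fixed three-boundary statistics that machine_idle used to keep are replaced above by a 12-entry histogram: cpu_itime_bins holds power-of-two boundaries from 16 us to 32768 us, and machine_classify_interval bumps the first bin whose boundary exceeds the interval (as written, intervals beyond the last boundary are not counted). A small standalone demo of the same classification, with made-up intervals:

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_USEC 1000ULL
#define NBINS 12

/* Mirrors cpu_itime_bins[] from the hunk above: 16us, 32us, ..., 32768us. */
static uint64_t bin_bounds[NBINS] = {
    16*NSEC_PER_USEC,   32*NSEC_PER_USEC,    64*NSEC_PER_USEC,    128*NSEC_PER_USEC,
    256*NSEC_PER_USEC,  512*NSEC_PER_USEC,   1024*NSEC_PER_USEC,  2048*NSEC_PER_USEC,
    4096*NSEC_PER_USEC, 8192*NSEC_PER_USEC,  16384*NSEC_PER_USEC, 32768*NSEC_PER_USEC
};

static void classify(uint64_t interval_ns, uint64_t bins[NBINS])
{
    for (uint32_t i = 0; i < NBINS; i++) {
        if (interval_ns < bin_bounds[i]) {
            bins[i]++;                      /* first boundary exceeding the interval */
            break;
        }
    }
    /* intervals >= 32768us fall through uncounted, matching the kernel routine */
}

int main(void)
{
    uint64_t bins[NBINS] = {0};
    classify(100 * NSEC_PER_USEC, bins);    /* 100us lands in bins[3] (< 128us)  */
    classify(3000 * NSEC_PER_USEC, bins);   /* 3ms lands in bins[8] (< 4096us)   */
    for (int i = 0; i < NBINS; i++)
        printf("%llu ", (unsigned long long)bins[i]);
    printf("\n");
    return 0;
}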
*/ - i386_init_slave_fast(); - - panic("init_slave_fast returned"); + if (cpup->cpu_number != master_cpu) { + i386_init_slave_fast(); + panic("init_slave_fast returned"); + } } else { /* @@ -257,13 +250,13 @@ pmInitComplete(void) pmInitDone = 1; } -static x86_lcpu_t * +x86_lcpu_t * pmGetLogicalCPU(int cpu) { return(cpu_to_lcpu(cpu)); } -static x86_lcpu_t * +x86_lcpu_t * pmGetMyLogicalCPU(void) { cpu_data_t *cpup = current_cpu_datap(); @@ -758,7 +751,7 @@ pmGetSavedRunCount(void) /* * Returns the root of the package tree. */ -static x86_pkg_t * +x86_pkg_t * pmGetPkgRoot(void) { return(x86_pkgs); @@ -770,7 +763,7 @@ pmCPUGetHibernate(int cpu) return(cpu_datap(cpu)->cpu_hibernate); } -static processor_t +processor_t pmLCPUtoProcessor(int lcpu) { return(cpu_datap(lcpu)->cpu_processor); @@ -814,7 +807,7 @@ pmGetNanotimeInfo(pm_rtc_nanotime_t *rtc_nanotime) && rtc_nanotime->generation != pal_rtc_nanotime_info.generation); } -static uint32_t +uint32_t pmTimerQueueMigrate(int target_cpu) { /* Call the etimer code to do this. */ @@ -867,6 +860,10 @@ pmKextRegister(uint32_t version, pmDispatch_t *cpuFuncs, } if (cpuFuncs != NULL) { + if (pmDispatch) { + panic("Attempt to re-register power management interface--AICPM present in xcpm mode? %p->%p", pmDispatch, cpuFuncs); + } + pmDispatch = cpuFuncs; if (earlyTopology @@ -938,7 +935,10 @@ void machine_track_platform_idle(boolean_t entry) { if (entry) { (void)__sync_fetch_and_add(&my_cpu->lcpu.package->num_idle, 1); } - else { - (void)__sync_fetch_and_sub(&my_cpu->lcpu.package->num_idle, 1); - } + else { + uint32_t nidle = __sync_fetch_and_sub(&my_cpu->lcpu.package->num_idle, 1); + if (nidle == topoParms.nLThreadsPerPackage) { + my_cpu->lcpu.package->package_idle_exits++; + } + } } diff --git a/osfmk/i386/pmCPU.h b/osfmk/i386/pmCPU.h index c443c1efa..1ed973e4c 100644 --- a/osfmk/i386/pmCPU.h +++ b/osfmk/i386/pmCPU.h @@ -141,6 +141,7 @@ void pmTimerSave(void); void pmTimerRestore(void); kern_return_t pmCPUExitHalt(int cpu); kern_return_t pmCPUExitHaltToOff(int cpu); +uint32_t pmTimerQueueMigrate(int); #define PM_HALT_NORMAL 0 /* normal halt path */ #define PM_HALT_DEBUG 1 /* debug code wants to halt */ @@ -159,6 +160,16 @@ extern int pmsafe_debug; #define URGENCY_NOTIFICATION_ASSERT_NS (5 * 1000 * 1000) extern uint64_t urgency_notification_assert_abstime_threshold; +x86_lcpu_t * +pmGetLogicalCPU(int cpu); +x86_lcpu_t * +pmGetMyLogicalCPU(void); +processor_t +pmLCPUtoProcessor(int lcpu); +x86_pkg_t * +pmGetPkgRoot(void); + + /****************************************************************************** * * All of the following are deprecated interfaces and no longer used. diff --git a/osfmk/i386/proc_reg.h b/osfmk/i386/proc_reg.h index 6438d9372..019d0aebe 100644 --- a/osfmk/i386/proc_reg.h +++ b/osfmk/i386/proc_reg.h @@ -397,6 +397,8 @@ static inline void flush_tlb_raw(void) set_cr3_raw(get_cr3_raw()); } #endif +extern int rdmsr64_carefully(uint32_t msr, uint64_t *val); +extern int wrmsr64_carefully(uint32_t msr, uint64_t val); #endif /* MACH_KERNEL_PRIVATE */ static inline void wbinvd(void) @@ -501,7 +503,6 @@ static inline uint64_t rdtscp64(uint32_t *aux) * The implementation is in locore.s. 
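The *_carefully MSR accessors declared above (their assembly bodies appear in x86_64/locore.s later in this patch) return nonzero instead of taking the #GP fault when the target MSR is not readable, which makes them suitable for probing model-specific registers. A short kernel-context usage sketch; MSR_IA32_TSC_DEADLINE is chosen only as an illustration of a register that may or may not be present:

/* Hedged usage sketch; not part of the patch. */
static void probe_tsc_deadline_msr(void)
{
    uint64_t v = 0;

    if (rdmsr64_carefully(MSR_IA32_TSC_DEADLINE, &v) != 0)
        kprintf("TSC-deadline MSR not readable on this CPU\n");
    else
        kprintf("TSC-deadline MSR = 0x%llx\n", v);
}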
*/ extern int rdmsr_carefully(uint32_t msr, uint32_t *lo, uint32_t *hi); - __END_DECLS #endif /* ASSEMBLER */ @@ -538,8 +539,6 @@ __END_DECLS #define MSR_IA32_MPERF 0xE7 #define MSR_IA32_APERF 0xE8 -#define MSR_PMG_CST_CONFIG_CONTROL 0xe2 - #define MSR_IA32_BBL_CR_CTL 0x119 #define MSR_IA32_SYSENTER_CS 0x174 @@ -560,7 +559,6 @@ __END_DECLS #define MSR_IA32_MISC_ENABLE 0x1a0 -#define MSR_IA32_ENERGY_PERFORMANCE_BIAS 0x1b0 #define MSR_IA32_PACKAGE_THERM_STATUS 0x1b1 #define MSR_IA32_PACKAGE_THERM_INTERRUPT 0x1b2 @@ -618,8 +616,20 @@ __END_DECLS #define MSR_IA32_PKG_POWER_SKU_UNIT 0x606 #define MSR_IA32_PKG_C2_RESIDENCY 0x60D #define MSR_IA32_PKG_ENERGY_STATUS 0x611 -#define MSR_IA32_PRIMARY_PLANE_ENERY_STATUS 0x639 -#define MSR_IA32_SECONDARY_PLANE_ENERY_STATUS 0x641 + +#define MSR_IA32_DDR_ENERGY_STATUS 0x619 +#define MSR_IA32_LLC_FLUSHED_RESIDENCY_TIMER 0x61D +#define MSR_IA32_RING_PERF_STATUS 0x621 + +#define MSR_IA32_PKG_C8_RESIDENCY 0x630 +#define MSR_IA32_PKG_C9_RESIDENCY 0x631 +#define MSR_IA32_PKG_C10_RESIDENCY 0x632 + +#define MSR_IA32_PP0_ENERGY_STATUS 0x639 +#define MSR_IA32_PP1_ENERGY_STATUS 0x641 +#define MSR_IA32_IA_PERF_LIMIT_REASONS 0x690 +#define MSR_IA32_GT_PERF_LIMIT_REASONS 0x6B0 + #define MSR_IA32_TSC_DEADLINE 0x6e0 #define MSR_IA32_EFER 0xC0000080 diff --git a/osfmk/i386/rtclock.c b/osfmk/i386/rtclock.c index 8a5f8c667..28354563c 100644 --- a/osfmk/i386/rtclock.c +++ b/osfmk/i386/rtclock.c @@ -91,42 +91,6 @@ rtc_timer_start(void) etimer_resync_deadlines(); } -/* - * tsc_to_nanoseconds: - * - * Basic routine to convert a raw 64 bit TSC value to a - * 64 bit nanosecond value. The conversion is implemented - * based on the scale factor and an implicit 32 bit shift. - */ -static inline uint64_t -_tsc_to_nanoseconds(uint64_t value) -{ -#if defined(__i386__) - asm volatile("movl %%edx,%%esi ;" - "mull %%ecx ;" - "movl %%edx,%%edi ;" - "movl %%esi,%%eax ;" - "mull %%ecx ;" - "addl %%edi,%%eax ;" - "adcl $0,%%edx " - : "+A" (value) - : "c" (pal_rtc_nanotime_info.scale) - : "esi", "edi"); -#elif defined(__x86_64__) - asm volatile("mul %%rcx;" - "shrq $32, %%rax;" - "shlq $32, %%rdx;" - "orq %%rdx, %%rax;" - : "=a"(value) - : "a"(value), "c"(pal_rtc_nanotime_info.scale) - : "rdx", "cc" ); -#else -#error Unsupported architecture -#endif - - return (value); -} - static inline uint32_t _absolutetime_to_microtime(uint64_t abstime, clock_sec_t *secs, clock_usec_t *microsecs) { @@ -251,13 +215,7 @@ rtc_nanotime_init_commpage(void) static inline uint64_t rtc_nanotime_read(void) { - -#if CONFIG_EMBEDDED - if (gPEClockFrequencyInfo.timebase_frequency_hz > SLOW_TSC_THRESHOLD) - return _rtc_nanotime_read(&rtc_nanotime_info, 1); /* slow processor */ - else -#endif - return _rtc_nanotime_read(&pal_rtc_nanotime_info, 0); /* assume fast processor */ + return _rtc_nanotime_read(&pal_rtc_nanotime_info); } /* @@ -277,8 +235,8 @@ rtc_clock_napped(uint64_t base, uint64_t tsc_base) assert(!ml_get_interrupts_enabled()); tsc = rdtsc64(); - oldnsecs = rntp->ns_base + _tsc_to_nanoseconds(tsc - rntp->tsc_base); - newnsecs = base + _tsc_to_nanoseconds(tsc - tsc_base); + oldnsecs = rntp->ns_base + _rtc_tsc_to_nanoseconds(tsc - rntp->tsc_base, rntp); + newnsecs = base + _rtc_tsc_to_nanoseconds(tsc - tsc_base, rntp); /* * Only update the base values if time using the new base values @@ -326,8 +284,8 @@ rtc_clock_stepped(__unused uint32_t new_frequency, * rtc_sleep_wakeup: * * Invoked from power management when we have awoken from a sleep (S3) - * and the TSC has been reset. 
The nanotime data is updated based on - * the passed in value. + * and the TSC has been reset, or from Deep Idle (S0) sleep when the TSC + * has progressed. The nanotime data is updated based on the passed-in value. * * The caller must guarantee non-reentrancy. */ @@ -377,7 +335,7 @@ rtclock_init(void) rtc_timer_init(); clock_timebase_init(); ml_init_lock_timeout(); - ml_init_delay_spin_threshold(); + ml_init_delay_spin_threshold(10); } /* Set fixed configuration for lapic timers */ @@ -394,14 +352,21 @@ static void rtc_set_timescale(uint64_t cycles) { pal_rtc_nanotime_t *rntp = &pal_rtc_nanotime_info; + uint32_t shift = 0; + + /* the "scale" factor will overflow unless cycles>SLOW_TSC_THRESHOLD */ + + while ( cycles <= SLOW_TSC_THRESHOLD) { + shift++; + cycles <<= 1; + } + + if ( shift != 0 ) + printf("Slow TSC, rtc_nanotime.shift == %d\n", shift); + rntp->scale = (uint32_t)(((uint64_t)NSEC_PER_SEC << 32) / cycles); -#if CONFIG_EMBEDDED - if (cycles <= SLOW_TSC_THRESHOLD) - rntp->shift = (uint32_t)cycles; - else -#endif - rntp->shift = 32; + rntp->shift = shift; if (tsc_rebase_abs_time == 0) tsc_rebase_abs_time = mach_absolute_time(); @@ -602,12 +567,11 @@ nanoseconds_to_absolutetime( void machine_delay_until( - uint64_t deadline) + uint64_t interval, + uint64_t deadline) { - uint64_t now; - - do { - cpu_pause(); - now = mach_absolute_time(); - } while (now < deadline); + (void)interval; + while (mach_absolute_time() < deadline) { + cpu_pause(); + } } diff --git a/osfmk/i386/rtclock_asm_native.h b/osfmk/i386/rtclock_asm_native.h index c17320b7a..528cbfe75 100644 --- a/osfmk/i386/rtclock_asm_native.h +++ b/osfmk/i386/rtclock_asm_native.h @@ -72,6 +72,25 @@ /* * Assembly snippet included in exception handlers and rtc_nanotime_read() + * + * + * Warning! There are several copies of this code in the trampolines found in + * osfmk/x86_64/idt64.s, coming from the various TIMER macros in rtclock_asm.h. + * They're all kept in sync by using the RTC_NANOTIME_READ() macro. + * + * The algorithm we use is: + * + * ns = ((((rdtsc - rnt_tsc_base)< SLOW_TSC_THRESHOLD + * + * Where SLOW_TSC_THRESHOLD is about 10e9. Since most processor's tscFreqs are greater + * than 1GHz, rnt_shift is usually 0. rnt_tsc_scale is also a 32-bit constant: + * + * rnt_tsc_scale = (10e9 * 2**32) / (tscFreq << rnt_shift); + * * %rdi points to nanotime info struct. 
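The shift loop in rtc_set_timescale above exists because scale = (NSEC_PER_SEC << 32) / cycles only fits in 32 bits once cycles exceeds roughly one billion (SLOW_TSC_THRESHOLD); doubling cycles shift times keeps the division in range, and the nanotime readers then compute ns = (((tsc_delta << shift) * scale) >> 32) + ns_base, which is the formula the adjacent rtclock_asm_native.h comment describes. A small standalone check of the arithmetic with an invented 500 MHz TSC (the kernel performs the multiply as a 96-bit operation in assembly; plain 64-bit C is adequate only for an example this small):

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC        1000000000ULL
#define SLOW_TSC_THRESHOLD  1000067800ULL    /* from osfmk/i386/tsc.h */

int main(void)
{
    uint64_t tsc_freq = 500000000ULL;        /* pretend 500 MHz TSC, purely illustrative */
    uint64_t cycles   = tsc_freq;
    uint32_t shift    = 0;

    while (cycles <= SLOW_TSC_THRESHOLD) {   /* same loop as rtc_set_timescale() above */
        shift++;
        cycles <<= 1;
    }
    uint32_t scale = (uint32_t)((NSEC_PER_SEC << 32) / cycles);

    /* 1,000,000 ticks at 500 MHz should come out to 2 ms. */
    uint64_t tsc_delta = 1000000ULL;
    uint64_t ns = ((tsc_delta << shift) * (uint64_t)scale) >> 32;

    printf("shift=%u scale=0x%08x ns=%llu\n",
           shift, scale, (unsigned long long)ns);
    return 0;
}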
* %rax returns nanotime */ @@ -83,9 +102,10 @@ rdtsc ; \ lfence ; \ shlq $32,%rdx ; \ + movl RNT_SHIFT(%rdi),%ecx ; \ orq %rdx,%rax /* %rax := tsc */ ; \ subq RNT_TSC_BASE(%rdi),%rax /* tsc - tsc_base */ ; \ - xorq %rcx,%rcx ; \ + shlq %cl,%rax ; \ movl RNT_SCALE(%rdi),%ecx ; \ mulq %rcx /* delta * scale */ ; \ shrdq $32,%rdx,%rax /* %rdx:%rax >>= 32 */ ; \ diff --git a/osfmk/i386/rtclock_protos.h b/osfmk/i386/rtclock_protos.h index 2d944765d..b467df170 100644 --- a/osfmk/i386/rtclock_protos.h +++ b/osfmk/i386/rtclock_protos.h @@ -48,8 +48,11 @@ extern void _rtc_nanotime_adjust( pal_rtc_nanotime_t *dst); extern uint64_t _rtc_nanotime_read( - pal_rtc_nanotime_t *rntp, - int slow); + pal_rtc_nanotime_t *rntp); + +extern uint64_t _rtc_tsc_to_nanoseconds( + uint64_t value, + pal_rtc_nanotime_t *rntp); extern void rtclock_intr(x86_saved_state_t *regs); diff --git a/osfmk/i386/trap.c b/osfmk/i386/trap.c index d7b460878..c6f921c6f 100644 --- a/osfmk/i386/trap.c +++ b/osfmk/i386/trap.c @@ -103,7 +103,7 @@ #include #include - +#include #include extern void throttle_lowpri_io(int); @@ -350,7 +350,7 @@ interrupt(x86_saved_state_t *state) int ipl; int cnum = cpu_number(); int itype = 0; - + if (is_saved_state64(state) == TRUE) { x86_saved_state64_t *state64; @@ -373,6 +373,9 @@ interrupt(x86_saved_state_t *state) interrupt_num = state32->trapno; } + if (cpu_data_ptr[cnum]->lcpu.package->num_idle == topoParms.nLThreadsPerPackage) + cpu_data_ptr[cnum]->cpu_hwIntpexits[interrupt_num]++; + if (interrupt_num == (LAPIC_DEFAULT_INTERRUPT_BASE + LAPIC_INTERPROCESSOR_INTERRUPT)) itype = 1; else if (interrupt_num == (LAPIC_DEFAULT_INTERRUPT_BASE + LAPIC_TIMER_INTERRUPT)) diff --git a/osfmk/i386/tsc.c b/osfmk/i386/tsc.c index 02b41779c..f8f4fd665 100644 --- a/osfmk/i386/tsc.c +++ b/osfmk/i386/tsc.c @@ -165,6 +165,7 @@ tsc_init(void) busFreq = EFI_FSB_frequency(); switch (cpuid_cpufamily()) { + case CPUFAMILY_INTEL_HASWELL: case CPUFAMILY_INTEL_IVYBRIDGE: case CPUFAMILY_INTEL_SANDYBRIDGE: case CPUFAMILY_INTEL_WESTMERE: @@ -211,7 +212,7 @@ tsc_init(void) } kprintf(" BUS: Frequency = %6d.%06dMHz, " - "cvtt2n = %08Xx.%08Xx, cvtn2t = %08Xx.%08Xx\n", + "cvtt2n = %08X.%08X, cvtn2t = %08X.%08X\n", (uint32_t)(busFreq / Mega), (uint32_t)(busFreq % Mega), (uint32_t)(busFCvtt2n >> 32), (uint32_t)busFCvtt2n, @@ -238,7 +239,7 @@ tsc_init(void) tscFCvtn2t = 0xFFFFFFFFFFFFFFFFULL / tscFCvtt2n; kprintf(" TSC: Frequency = %6d.%06dMHz, " - "cvtt2n = %08Xx.%08Xx, cvtn2t = %08Xx.%08Xx, gran = %lld%s\n", + "cvtt2n = %08X.%08X, cvtn2t = %08X.%08X, gran = %lld%s\n", (uint32_t)(tscFreq / Mega), (uint32_t)(tscFreq % Mega), (uint32_t)(tscFCvtt2n >> 32), (uint32_t)tscFCvtt2n, diff --git a/osfmk/i386/tsc.h b/osfmk/i386/tsc.h index 2f6011b93..42bb9684d 100644 --- a/osfmk/i386/tsc.h +++ b/osfmk/i386/tsc.h @@ -42,7 +42,7 @@ #define BASE_NHM_CLOCK_SOURCE 133333333ULL #define IA32_PERF_STS 0x198 -#define SLOW_TSC_THRESHOLD 1000067800 /* TSC is too slow for regular nanotime() algorithm */ +#define SLOW_TSC_THRESHOLD 1000067800 /* if slower, nonzero shift required in nanotime() algorithm */ #ifndef ASSEMBLER extern uint64_t busFCvtt2n; diff --git a/osfmk/kern/clock.c b/osfmk/kern/clock.c index c7aaa6faf..771a0dbef 100644 --- a/osfmk/kern/clock.c +++ b/osfmk/kern/clock.c @@ -827,7 +827,7 @@ _clock_delay_until_deadline( if ( ml_delay_should_spin(interval) || get_preemption_level() != 0 || ml_get_interrupts_enabled() == FALSE ) { - machine_delay_until(deadline); + machine_delay_until(interval, deadline); } else { 
assert_wait_deadline((event_t)clock_delay_until, THREAD_UNINT, deadline); diff --git a/osfmk/kern/clock.h b/osfmk/kern/clock.h index ed8218d17..fd31a1b9b 100644 --- a/osfmk/kern/clock.h +++ b/osfmk/kern/clock.h @@ -117,7 +117,7 @@ extern void clock_gettimeofday_set_commpage( clock_sec_t *secs, clock_usec_t *microsecs); -extern void machine_delay_until( +extern void machine_delay_until(uint64_t interval, uint64_t deadline); extern uint32_t hz_tick_interval; diff --git a/osfmk/kern/machine.c b/osfmk/kern/machine.c index 9b310f031..c0caa5511 100644 --- a/osfmk/kern/machine.c +++ b/osfmk/kern/machine.c @@ -237,7 +237,7 @@ processor_shutdown( } /* - * Called at splsched. + * Called with interrupts disabled. */ void processor_doshutdown( @@ -245,6 +245,7 @@ processor_doshutdown( { thread_t old_thread, self = current_thread(); processor_t prev; + processor_set_t pset; /* * Get onto the processor to shutdown @@ -252,18 +253,29 @@ processor_doshutdown( prev = thread_bind(processor); thread_block(THREAD_CONTINUE_NULL); -#if HIBERNATION - if (processor_avail_count < 2) - hibernate_vm_lock(); -#endif - assert(processor->state == PROCESSOR_SHUTDOWN); + ml_cpu_down(); + #if HIBERNATION - if (processor_avail_count < 2) + if (processor_avail_count < 2) { + hibernate_vm_lock(); hibernate_vm_unlock(); + } #endif + pset = processor->processor_set; + pset_lock(pset); + processor->state = PROCESSOR_OFF_LINE; + if (--pset->online_processor_count == 0) { + pset_pri_init_hint(pset, PROCESSOR_NULL); + pset_count_init_hint(pset, PROCESSOR_NULL); + } + (void)hw_atomic_sub(&processor_avail_count, 1); + commpage_update_active_cpus(); + SCHED(processor_queue_shutdown)(processor); + /* pset lock dropped */ + /* * Continue processor shutdown in shutdown context. */ @@ -274,7 +286,7 @@ processor_doshutdown( } /* - * Complete the shutdown and place the processor offline. + *Complete the shutdown and place the processor offline. * * Called at splsched in the shutdown context. */ @@ -283,7 +295,6 @@ processor_offline( processor_t processor) { thread_t new_thread, old_thread = processor->active_thread; - processor_set_t pset; new_thread = processor->idle_thread; processor->active_thread = new_thread; @@ -301,20 +312,6 @@ processor_offline( PMAP_DEACTIVATE_KERNEL(processor->cpu_id); - pset = processor->processor_set; - pset_lock(pset); - processor->state = PROCESSOR_OFF_LINE; - if (--pset->online_processor_count == 0) { - pset_pri_init_hint(pset, PROCESSOR_NULL); - pset_count_init_hint(pset, PROCESSOR_NULL); - } - (void)hw_atomic_sub(&processor_avail_count, 1); - commpage_update_active_cpus(); - SCHED(processor_queue_shutdown)(processor); - /* pset lock dropped */ - - ml_cpu_down(); - cpu_sleep(); panic("zombie processor"); /*NOTREACHED*/ diff --git a/osfmk/kern/processor.c b/osfmk/kern/processor.c index 23a549611..ead95b882 100644 --- a/osfmk/kern/processor.c +++ b/osfmk/kern/processor.c @@ -143,6 +143,8 @@ processor_init( int cpu_id, processor_set_t pset) { + spl_t s; + if (processor != master_processor) { /* Scheduler state deferred until sched_init() */ SCHED(processor_init)(processor); @@ -162,6 +164,7 @@ processor_init( processor_data_init(processor); processor->processor_list = NULL; + s = splsched(); pset_lock(pset); if (pset->cpu_set_count++ == 0) pset->cpu_set_low = pset->cpu_set_hi = cpu_id; @@ -170,6 +173,7 @@ processor_init( pset->cpu_set_hi = (cpu_id > pset->cpu_set_hi)? 
cpu_id: pset->cpu_set_hi; } pset_unlock(pset); + splx(s); simple_lock(&processor_list_lock); if (processor_list == NULL) diff --git a/osfmk/kern/startup.c b/osfmk/kern/startup.c index b20629ffa..e17631376 100644 --- a/osfmk/kern/startup.c +++ b/osfmk/kern/startup.c @@ -118,6 +118,7 @@ #include #endif +#include static void kernel_bootstrap_thread(void); static void load_context( @@ -142,6 +143,7 @@ extern int serverperfmode; /* size of kernel trace buffer, disabled by default */ unsigned int new_nkdbufs = 0; +unsigned int wake_nkdbufs = 0; /* mach leak logging */ int log_leaks = 0; @@ -177,6 +179,8 @@ kernel_bootstrap(void) PE_parse_boot_argn("trace", &new_nkdbufs, sizeof (new_nkdbufs)); + PE_parse_boot_argn("trace_wake", &wake_nkdbufs, sizeof (wake_nkdbufs)); + /* i386_vm_init already checks for this ; do it aagin anyway */ if (PE_parse_boot_argn("serverperfmode", &serverperfmode, sizeof (serverperfmode))) { serverperfmode = 1; @@ -341,7 +345,7 @@ kernel_bootstrap_thread(void) #if (defined(__i386__) || defined(__x86_64__)) if (turn_on_log_leaks && !new_nkdbufs) new_nkdbufs = 200000; - start_kern_tracing(new_nkdbufs); + start_kern_tracing(new_nkdbufs, FALSE); if (turn_on_log_leaks) log_leaks = 1; #endif @@ -366,7 +370,7 @@ kernel_bootstrap_thread(void) #if (!defined(__i386__) && !defined(__x86_64__)) if (turn_on_log_leaks && !new_nkdbufs) new_nkdbufs = 200000; - start_kern_tracing(new_nkdbufs); + start_kern_tracing(new_nkdbufs, FALSE); if (turn_on_log_leaks) log_leaks = 1; #endif diff --git a/osfmk/mach/branch_predicates.h b/osfmk/mach/branch_predicates.h index 8d16db0fa..ab32e87a7 100644 --- a/osfmk/mach/branch_predicates.h +++ b/osfmk/mach/branch_predicates.h @@ -30,6 +30,6 @@ #ifndef _MACH_BRANCH_PREDICATES_H #define _MACH_BRANCH_PREDICATES_H -#define __probable(x) __builtin_expect((x), 1) -#define __improbable(x) __builtin_expect((x), 0) +#define __probable(x) __builtin_expect((long)(x), 1L) +#define __improbable(x) __builtin_expect((long)(x), 0L) #endif /* _MACH_BRANCH_PREDICATES_H */ diff --git a/osfmk/mach/i386/thread_status.h b/osfmk/mach/i386/thread_status.h index 715422ac8..36232c736 100644 --- a/osfmk/mach/i386/thread_status.h +++ b/osfmk/mach/i386/thread_status.h @@ -110,9 +110,10 @@ #define x86_DEBUG_STATE64 11 #define x86_DEBUG_STATE 12 #define THREAD_STATE_NONE 13 -/* 15 and 16 are used for the internal x86_SAVED_STATE flavours */ +/* 14 and 15 are used for the internal x86_SAVED_STATE flavours */ #define x86_AVX_STATE32 16 #define x86_AVX_STATE64 17 +#define x86_AVX_STATE 18 /* @@ -142,6 +143,7 @@ (x == x86_DEBUG_STATE) || \ (x == x86_AVX_STATE32) || \ (x == x86_AVX_STATE64) || \ + (x == x86_AVX_STATE) || \ (x == THREAD_STATE_NONE)) struct x86_state_hdr { @@ -263,6 +265,14 @@ struct x86_debug_state { } uds; }; +struct x86_avx_state { + x86_state_hdr_t ash; + union { + x86_avx_state32_t as32; + x86_avx_state64_t as64; + } ufs; +}; + typedef struct x86_thread_state x86_thread_state_t; #define x86_THREAD_STATE_COUNT ((mach_msg_type_number_t) \ ( sizeof (x86_thread_state_t) / sizeof (int) )) @@ -279,6 +289,10 @@ typedef struct x86_debug_state x86_debug_state_t; #define x86_DEBUG_STATE_COUNT ((mach_msg_type_number_t) \ (sizeof(x86_debug_state_t)/sizeof(unsigned int))) +typedef struct x86_avx_state x86_avx_state_t; +#define x86_AVX_STATE_COUNT ((mach_msg_type_number_t) \ + (sizeof(x86_avx_state_t)/sizeof(unsigned int))) + /* * Machine-independent way for servers and Mach's exception mechanism to * choose the most efficient state flavor for exception RPC's: diff --git 
a/osfmk/mach/mach_types.defs b/osfmk/mach/mach_types.defs index a013ff1ae..e890f6652 100644 --- a/osfmk/mach/mach_types.defs +++ b/osfmk/mach/mach_types.defs @@ -217,6 +217,7 @@ type thread_policy_t = array[*:16] of integer_t; * task_extmod_info_t (8 64-bit ints) * task_basic_info_64_2_t * mach_task_basic_info_t (12 ints) + * task_power_info_t (18 ints) * If other task_info flavors are added, this * definition may need to be changed. (See * mach/task_info.h and mach/policy.h) */ @@ -303,12 +304,13 @@ type host_security_t = mach_port_t * kernel_resource_sizes_t (5 ints) * host_load_info_t (6 ints) * vm_statistics32_t (15 ints) + * host_expired_task_info uses a task_power_info (18 ints) * * If other host_info flavors are added, this definition may * need to be changed. (See mach/{host_info,vm_statistics}.h) */ type host_flavor_t = int; -type host_info_t = array[*:15] of integer_t; +type host_info_t = array[*:18] of integer_t; /* diff --git a/osfmk/mach/machine.h b/osfmk/mach/machine.h index 57a0c28e6..1520a6049 100644 --- a/osfmk/mach/machine.h +++ b/osfmk/mach/machine.h @@ -378,6 +378,7 @@ __END_DECLS #define CPUFAMILY_INTEL_WESTMERE 0x573b5eec #define CPUFAMILY_INTEL_SANDYBRIDGE 0x5490b78c #define CPUFAMILY_INTEL_IVYBRIDGE 0x1f65e835 +#define CPUFAMILY_INTEL_HASWELL 0x10b282dc #define CPUFAMILY_ARM_9 0xe73283ae #define CPUFAMILY_ARM_11 0x8ff620d8 #define CPUFAMILY_ARM_XSCALE 0x53b005f5 diff --git a/osfmk/mach/thread_status.h b/osfmk/mach/thread_status.h index aead09bf9..cef4380bf 100644 --- a/osfmk/mach/thread_status.h +++ b/osfmk/mach/thread_status.h @@ -87,6 +87,7 @@ typedef natural_t thread_state_data_t[THREAD_STATE_MAX]; #define THREAD_STATE_FLAVOR_LIST 0 /* List of valid flavors */ #define THREAD_STATE_FLAVOR_LIST_NEW 128 +#define THREAD_STATE_FLAVOR_LIST_10_9 129 typedef int thread_state_flavor_t; typedef thread_state_flavor_t *thread_state_flavor_array_t; diff --git a/osfmk/vm/vm_kern.c b/osfmk/vm/vm_kern.c index e35f70daf..0629a9900 100644 --- a/osfmk/vm/vm_kern.c +++ b/osfmk/vm/vm_kern.c @@ -233,7 +233,7 @@ kernel_memory_allocate( vm_object_t object; vm_object_offset_t offset; vm_object_offset_t pg_offset; - vm_map_entry_t entry; + vm_map_entry_t entry = NULL; vm_map_offset_t map_addr, fill_start; vm_map_offset_t map_mask; vm_map_size_t map_size, fill_size; diff --git a/osfmk/x86_64/Makefile b/osfmk/x86_64/Makefile index 354a2f3db..8e7e0cc88 100644 --- a/osfmk/x86_64/Makefile +++ b/osfmk/x86_64/Makefile @@ -3,21 +3,9 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) -EXPORT_ONLY_FILES = - -INSTALL_MD_DIR = x86_64 - -INSTALL_MD_LIST = - -INSTALL_MD_LCL_LIST = - -EXPORT_MD_LIST = ${EXPORT_ONLY_FILES} - -EXPORT_MD_DIR = x86_64 include $(MakeInc_rule) include $(MakeInc_dir) diff --git a/osfmk/x86_64/locore.s b/osfmk/x86_64/locore.s index f13db5aab..8ca0c92a4 100644 --- a/osfmk/x86_64/locore.s +++ b/osfmk/x86_64/locore.s @@ -128,6 +128,40 @@ rdmsr_fail: movq $1, %rax ret +/* + * int rdmsr64_carefully(uint32_t msr, uint64_t *val); + */ + +ENTRY(rdmsr64_carefully) + movl %edi, %ecx + RECOVERY_SECTION + RECOVER(rdmsr64_carefully_fail) + rdmsr + movl %eax, (%rsi) + movl %edx, 4(%rsi) + xorl %eax, %eax + ret +rdmsr64_carefully_fail: + movl $1, %eax + ret +/* + * int wrmsr64_carefully(uint32_t msr, uint64_t val); + */ + +ENTRY(wrmsr_carefully) + movl %edi, %ecx + movl %esi, %eax + shr $32, %rsi + movl %esi, %edx + RECOVERY_SECTION + 
RECOVER(wrmsr_fail) + wrmsr + xorl %eax, %eax + ret +wrmsr_fail: + movl $1, %eax + ret + .globl EXT(thread_exception_return) .globl EXT(thread_bootstrap_return) LEXT(thread_bootstrap_return) diff --git a/osfmk/x86_64/machine_routines_asm.s b/osfmk/x86_64/machine_routines_asm.s index 362887586..1d2e3ed3f 100644 --- a/osfmk/x86_64/machine_routines_asm.s +++ b/osfmk/x86_64/machine_routines_asm.s @@ -104,28 +104,36 @@ ENTRY(_rtc_nanotime_adjust) ret /* - * unint64_t _rtc_nanotime_read(rtc_nanotime_t *rntp, int slow); + * uint64_t _rtc_nanotime_read(rtc_nanotime_t *rntp); * * This is the same as the commpage nanotime routine, except that it uses the * kernel internal "rtc_nanotime_info" data instead of the commpage data. * These two copies of data are kept in sync by rtc_clock_napped(). * - * Warning! There is another copy of this code in osfmk/x86_64/idt64.s. - * These are kept in sync by both using the RTC_NANOTIME_READ() macro. + * Warning! There are several copies of this code in the trampolines found in + * osfmk/x86_64/idt64.s, coming from the various TIMER macros in rtclock_asm.h. + * They're all kept in sync by using the RTC_NANOTIME_READ() macro. * - * There are two versions of this algorithm, for "slow" and "fast" processors. - * The more common "fast" algorithm is: + * The algorithm we use is: * - * ns = (((rdtsc - rnt_tsc_base)*rnt_tsc_scale) / 2**32) + rnt_ns_base; + * ns = ((((rdtsc - rnt_tsc_base)<<rnt_shift)*rnt_tsc_scale) / 2**32) + rnt_ns_base; + * + * rnt_shift, a constant computed during initialization, is the smallest value for which: + * + * (tscFreq << rnt_shift) > SLOW_TSC_THRESHOLD * - * The "slow" algorithm uses long division: + * Where SLOW_TSC_THRESHOLD is about 10e9. Since most processors' tscFreqs are greater + * than 1GHz, rnt_shift is usually 0. rnt_tsc_scale is also a 32-bit constant: * - * ns = (((rdtsc - rnt_tsc_base) * 10e9) / tscFreq) + rnt_ns_base; + * rnt_tsc_scale = (10e9 * 2**32) / (tscFreq << rnt_shift); + * + * On 64-bit processors this algorithm could be simplified by doing a 64x64 bit + * multiply of rdtsc by tscFCvtt2n: + * + * ns = (((rdtsc - rnt_tsc_base) * tscFCvtt2n) / 2**32) + rnt_ns_base; + * + * We don't do so in order to use the same algorithm in 32- and 64-bit mode. + * When U32 goes away, we should reconsider. * * Since this routine is not synchronized and can be called in any context, * we use a generation count to guard against seeing partially updated data. @@ -136,33 +144,36 @@ ENTRY(_rtc_nanotime_adjust) * the generation is zero. * * uint64_t _rtc_nanotime_read( - * rtc_nanotime_t *rntp, // %rdi - * int slow); // %rsi + * rtc_nanotime_t *rntp); // %rdi * */ ENTRY(_rtc_nanotime_read) - test %rsi,%rsi - jnz Lslow - - /* - * Processor whose TSC frequency is faster than SLOW_TSC_THRESHOLD - */ + PAL_RTC_NANOTIME_READ_FAST() ret + +/* + * extern uint64_t _rtc_tsc_to_nanoseconds( + * uint64_t value, // %rdi + * pal_rtc_nanotime_t *rntp); // %rsi + * + * Converts TSC units to nanoseconds, using an abbreviated form of the above + * algorithm. Note that while we could have simply used tmrCvt(value,tscFCvtt2n), + * which would avoid the need for this asm, doing so is a bit more risky since + * we'd be using a different algorithm with possibly different rounding etc. + */ - /* - * Processor whose TSC frequency is not faster than SLOW_TSC_THRESHOLD - * But K64 doesn't support this...
- */ -Lslow: - lea 1f(%rip),%rdi - xorb %al,%al - call EXT(panic) - hlt - .data -1: String "_rtc_nanotime_read() - slow algorithm not supported" - .text +ENTRY(_rtc_tsc_to_nanoseconds) + movq %rdi,%rax /* copy value (in TSC units) to convert */ + movl RNT_SHIFT(%rsi),%ecx + movl RNT_SCALE(%rsi),%edx + shlq %cl,%rax /* tscUnits << shift */ + mulq %rdx /* (tscUnits << shift) * scale */ + shrdq $32,%rdx,%rax /* %rdx:%rax >>= 32 */ + ret + + Entry(call_continuation) movq %rdi,%rcx /* get continuation */ diff --git a/osfmk/x86_64/pmap.c b/osfmk/x86_64/pmap.c index 2bc5bfab7..3f05e7f27 100644 --- a/osfmk/x86_64/pmap.c +++ b/osfmk/x86_64/pmap.c @@ -295,6 +295,7 @@ boolean_t pmap_smep_enabled = FALSE; void pmap_cpu_init(void) { + cpu_data_t *cdp = current_cpu_datap(); /* * Here early in the life of a processor (from cpu_mode_init()). * Ensure global page feature is disabled at this point. @@ -305,10 +306,10 @@ pmap_cpu_init(void) /* * Initialize the per-cpu, TLB-related fields. */ - current_cpu_datap()->cpu_kernel_cr3 = kernel_pmap->pm_cr3; - current_cpu_datap()->cpu_active_cr3 = kernel_pmap->pm_cr3; - current_cpu_datap()->cpu_tlb_invalid = FALSE; - current_cpu_datap()->cpu_task_map = TASK_MAP_64BIT; + cdp->cpu_kernel_cr3 = kernel_pmap->pm_cr3; + cdp->cpu_active_cr3 = kernel_pmap->pm_cr3; + cdp->cpu_tlb_invalid = FALSE; + cdp->cpu_task_map = TASK_MAP_64BIT; pmap_pcid_configure(); if (cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_SMEP) { boolean_t nsmep; @@ -317,6 +318,11 @@ pmap_cpu_init(void) pmap_smep_enabled = TRUE; } } + + if (cdp->cpu_fixed_pmcs_enabled) { + boolean_t enable = TRUE; + cpu_pmc_control(&enable); + } } diff --git a/pexpert/i386/pe_serial.c b/pexpert/i386/pe_serial.c index fcff88b88..f056487a0 100644 --- a/pexpert/i386/pe_serial.c +++ b/pexpert/i386/pe_serial.c @@ -201,7 +201,6 @@ int serial_init( void ) void serial_putc( char c ) { uart_putc(c); - if (c == '\n') uart_putc('\r'); } int serial_getc( void ) -- 2.45.2
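
The new x86_AVX_STATE flavor works like the existing combined flavors (x86_THREAD_STATE, x86_FLOAT_STATE, x86_DEBUG_STATE): the kernel fills in the x86_state_hdr with the 32- or 64-bit sub-flavor and the caller picks the matching union member. The following is a minimal user-space sketch, not part of the patch; it assumes only the declarations added in the thread_status.h hunk above plus the standard Mach thread_get_state() call, and it abbreviates error handling.

#include <mach/mach.h>
#include <mach/thread_status.h>
#include <stdio.h>

/* Sketch: report which AVX sub-flavor a thread carries.
 * Assumes the x86_AVX_STATE flavor and x86_avx_state_t added by this patch
 * are visible in the installed headers. */
static void
dump_avx_flavor(thread_act_t thread)
{
	x86_avx_state_t		state;
	mach_msg_type_number_t	count = x86_AVX_STATE_COUNT;
	kern_return_t		kr;

	kr = thread_get_state(thread, x86_AVX_STATE,
			      (thread_state_t)&state, &count);
	if (kr != KERN_SUCCESS) {
		printf("thread_get_state: %d\n", kr);
		return;
	}

	/* The header says which member of the ufs union is valid. */
	if (state.ash.flavor == x86_AVX_STATE64)
		printf("64-bit AVX state (%d words)\n", (int)state.ash.count);
	else if (state.ash.flavor == x86_AVX_STATE32)
		printf("32-bit AVX state (%d words)\n", (int)state.ash.count);
}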
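
rdmsr64_carefully() and the write counterpart added in the locore.s hunk let kernel code touch an MSR that may not exist on the running CPU: a #GP raised by rdmsr/wrmsr inside the RECOVERY_SECTION lands on the recovery label, and the routine returns non-zero instead of taking a panic. The sketch below is illustrative only; the prototypes follow the comments in the patch, the write routine is declared under the name its ENTRY uses (the comment calls it wrmsr64_carefully while the symbol is wrmsr_carefully), and the MSR number is just an example.

#include <stdint.h>

/* Prototypes as documented in the osfmk/x86_64/locore.s hunk above. */
extern int rdmsr64_carefully(uint32_t msr, uint64_t *val);
extern int wrmsr_carefully(uint32_t msr, uint64_t val);

#define MSR_IA32_PLATFORM_ID	0x17	/* example MSR; any number can be probed */

/* Returns 1 and fills *val if the MSR is implemented, 0 if reading it faults.
 * Sketch only: a real caller would live under osfmk/i386 and use kernel headers. */
static int
probe_msr(uint32_t msr, uint64_t *val)
{
	return rdmsr64_carefully(msr, val) == 0;
}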
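
The nanotime comment above boils down to one fixed-point conversion. Roughly, the C equivalent of what PAL_RTC_NANOTIME_READ_FAST() and _rtc_tsc_to_nanoseconds compute is the sketch below; it ignores the generation-count retry loop the real code uses to guard against concurrent updates, and the structure is a pared-down stand-in for pal_rtc_nanotime_t using the field names from the comment.

#include <stdint.h>

/* Minimal stand-in for pal_rtc_nanotime_t; only the fields the formula needs. */
struct nanotime_info {
	uint64_t	tsc_base;	/* rnt_tsc_base */
	uint64_t	ns_base;	/* rnt_ns_base  */
	uint32_t	scale;		/* rnt_tsc_scale = (10e9 * 2**32) / (tscFreq << shift) */
	uint32_t	shift;		/* rnt_shift: 0 unless tscFreq is below ~1GHz */
};

/* ns = ((((tsc - tsc_base) << shift) * scale) / 2**32) + ns_base */
static uint64_t
tsc_to_nanoseconds(uint64_t tsc, const struct nanotime_info *info)
{
	uint64_t delta = (tsc - info->tsc_base) << info->shift;

	/* 64x32-bit multiply then >> 32, mirroring the mulq/shrdq pair in the asm;
	 * unsigned __int128 stands in for the %rdx:%rax product of mulq. */
	return (uint64_t)(((unsigned __int128)delta * info->scale) >> 32)
	    + info->ns_base;
}

Because rnt_tsc_scale already carries the 2**32 scaling, the multiply-then-shift is a 32.32 fixed-point multiply by nanoseconds-per-TSC-tick, which is why _rtc_tsc_to_nanoseconds can reuse the same constants without any division.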