From 4a3eedf9ecc9bbe3f3a5c6ce5e53ad199d639d32 Mon Sep 17 00:00:00 2001 From: Apple Date: Tue, 12 Feb 2008 10:17:42 +0000 Subject: [PATCH] xnu-1228.3.13.tar.gz --- README | 24 +- bsd/conf/MASTER.i386 | 5 +- bsd/conf/MASTER.ppc | 3 +- bsd/conf/files | 1 - bsd/dev/dtrace/dtrace_glue.c | 18 +- bsd/dev/dtrace/lockstat.c | 2 +- bsd/dev/memdev.c | 22 ++ bsd/dev/random/randomdev.c | 43 +++- bsd/hfs/hfs_catalog.c | 25 +- bsd/hfs/hfs_cnode.c | 64 ++++- bsd/hfs/hfs_link.c | 5 +- bsd/hfs/hfs_vfsops.c | 11 +- bsd/hfs/hfs_vnops.c | 76 ++++-- bsd/hfs/hfs_xattr.c | 2 + bsd/kern/bsd_init.c | 3 + bsd/kern/kern_exec.c | 5 +- bsd/kern/kern_exit.c | 4 +- bsd/kern/kern_sysctl.c | 15 +- bsd/kern/kpi_socketfilter.c | 40 ++- bsd/kern/pthread_synch.c | 5 +- bsd/kern/sys_generic.c | 5 +- bsd/kern/uipc_socket.c | 27 +- bsd/kern/uipc_socket2.c | 1 + bsd/kern/uipc_syscalls.c | 31 ++- bsd/net/dlil.c | 26 ++ bsd/net/dlil.h | 1 + bsd/net/if.c | 5 +- bsd/net/kext_net.h | 5 +- bsd/net/kpi_interface.c | 75 +++--- bsd/net/kpi_interface.h | 19 ++ bsd/net/kpi_protocol.c | 1 + bsd/net/route.c | 2 +- bsd/netinet/igmp.c | 12 +- bsd/netinet/in.h | 2 +- bsd/netinet/ip_fw2.h | 3 + bsd/netinet/ip_input.c | 31 ++- bsd/netinet/ip_output.c | 10 +- bsd/netinet/raw_ip.c | 2 + bsd/netinet/tcp_input.c | 77 +++--- bsd/netinet/tcp_output.c | 36 ++- bsd/netinet/tcp_timer.c | 6 +- bsd/netinet6/nd6.c | 8 +- bsd/nfs/nfs_socket.c | 2 + bsd/nfs/nfs_syscalls.c | 2 + bsd/nfs/nfsm_subs.h | 16 +- bsd/sys/aio.h | 2 +- bsd/sys/dtrace_glue.h | 1 + bsd/sys/errno.h | 1 + bsd/sys/namei.h | 2 +- bsd/sys/reboot.h | 1 + bsd/sys/socket.h | 1 + bsd/sys/socketvar.h | 1 + bsd/vfs/vfs_cache.c | 91 ++++--- bsd/vfs/vfs_journal.c | 6 +- bsd/vfs/vfs_lookup.c | 29 ++- bsd/vfs/vfs_subr.c | 26 +- bsd/vfs/vfs_xattr.c | 2 +- bsd/vm/vm_unix.c | 4 +- config/BSDKernel.exports | 1 + config/IOKit.exports | 4 +- config/MasterVersion | 2 +- config/System6.0.exports | 2 - iokit/IOKit/pwr_mgt/IOPM.h | 82 ++++++ iokit/IOKit/pwr_mgt/RootDomain.h | 19 ++ 
iokit/Kernel/IOBufferMemoryDescriptor.cpp | 4 +- iokit/Kernel/IODMACommand.cpp | 4 +- iokit/Kernel/IODeviceTreeSupport.cpp | 27 +- iokit/Kernel/IOHibernateIO.cpp | 5 +- iokit/Kernel/IOKitKernelInternal.h | 2 +- iokit/Kernel/IOMemoryDescriptor.cpp | 2 + iokit/Kernel/IOPMrootDomain.cpp | 80 +++++- iokit/Kernel/IOPlatformExpert.cpp | 67 +++-- iokit/bsddev/IOKitBSDInit.cpp | 10 +- iokit/conf/MASTER | 1 - iokit/conf/files | 1 - kgmacros | 166 +++++++++++++ libsyscall/Makefile | 2 +- libsyscall/Makefile.xbs | 10 +- libsyscall/create-syscalls.pl | 4 +- libsyscall/custom/SYS.h | 6 +- libsyscall/mach/Makefile.inc | 9 +- makedefs/MakeInc.def | 50 +--- osfmk/conf/MASTER.i386 | 4 + osfmk/conf/MASTER.ppc | 4 + osfmk/i386/AT386/model_dep.c | 2 + osfmk/i386/acpi.c | 6 +- osfmk/i386/hpet.c | 27 +- osfmk/i386/machine_check.c | 11 +- osfmk/i386/misc_protos.h | 2 +- osfmk/i386/mp_desc.c | 5 + osfmk/i386/pmap.c | 4 + osfmk/i386/rtclock.c | 55 ++-- osfmk/i386/startup64.c | 2 + osfmk/i386/thread.h | 2 +- osfmk/i386/tsc.c | 13 +- osfmk/kern/etimer.h | 2 - osfmk/kern/locks.c | 27 +- osfmk/kern/mach_clock.c | 9 +- osfmk/kern/priority.c | 3 +- osfmk/kern/sched_prim.c | 81 +++--- osfmk/kern/syscall_subr.c | 2 - osfmk/kern/thread.h | 13 +- osfmk/kern/thread_act.c | 1 - osfmk/mach/i386/thread_status.h | 2 +- osfmk/mach/machine.h | 3 + osfmk/vm/bsd_vm.c | 14 +- osfmk/vm/vm_fault.c | 97 ++++++-- osfmk/vm/vm_map.c | 41 ++- osfmk/vm/vm_object.c | 2 +- osfmk/vm/vm_page.h | 10 +- osfmk/vm/vm_pageout.c | 289 +++++++++++++++++++--- osfmk/vm/vm_purgeable.c | 33 ++- osfmk/vm/vm_purgeable_internal.h | 9 +- osfmk/vm/vm_resident.c | 15 +- osfmk/vm/vm_shared_region.c | 183 ++++++++++---- security/conf/MASTER | 2 +- security/conf/MASTER.i386 | 7 +- security/conf/MASTER.ppc | 6 +- security/conf/Makefile.template | 2 +- security/conf/files | 6 + security/mac_audit.c | 8 +- security/mac_base.c | 4 + tools/tests/xnu_quick_test/tests.c | 2 +- 123 files changed, 1940 insertions(+), 575 deletions(-) diff --git 
a/README b/README index 9ab5b012d..76ea08c38 100644 --- a/README +++ b/README @@ -15,31 +15,27 @@ A. How to build XNU: By default, architecture defaults to the build machine architecture, and the kernel configuration is set to build for DEVELOPMENT. - The machine configuration defaults to MX31ADS for arm and nothing for i386 and ppc. + The machine configuration defaults to S5L8900XRB for arm and default for i386 and ppc. This will also create a bootable image, mach_kernel, and a kernel binary with symbols, mach_kernel.sys. - - Here are the valid arm machine configs: - LN2410SBC MX31ADS INTEGRATORCP S5I3000SMDK S5L8900XFPGA S5L8900XRB - OLOCREEK Examples: - /* make a debug kernel for MX31 arm board */ - make TARGET_CONFIGS="debug arm MX31ADS" + /* make a debug kernel for H1 arm board */ + make TARGET_CONFIGS="debug arm s5l8900xrb" - $(OBJROOT)/DEBUG_ARM_MX31ADS/osfmk/DEBUG/osfmk.o: pre-linked object for osfmk component - $(OBJROOT)/DEBUG_ARM_MX31ADS/mach_kernel: bootable image + $(OBJROOT)/DEBUG_ARM_S5L8900XRB/osfmk/DEBUG/osfmk.o: pre-linked object for osfmk component + $(OBJROOT)/DEBUG_ARM_S5L8900XRB/mach_kernel: bootable image - /* make debug and development kernels for MX31 arm board */ - make TARGET_CONFIGS="debug arm MX31ADS development arm MX31ADS" + /* make debug and development kernels for H1 arm board */ + make TARGET_CONFIGS="debug arm s5l8900xrb development arm s5l8900xrb" - $(OBJROOT)/DEBUG_ARM_MX31ADS/osfmk/DEBUG/osfmk.o: pre-linked object for osfmk component - $(OBJROOT)/DEBUG_ARM_MX31ADS/mach_kernel: bootable image + $(OBJROOT)/DEBUG_ARM_S5L8900XRB/osfmk/DEBUG/osfmk.o: pre-linked object for osfmk component + $(OBJROOT)/DEBUG_ARM_S5L8900XRB/mach_kernel: bootable image $(OBJROOT)/DEVELOPMENT_ARM/osfmk/DEVELOPMENT/osfmk.o: pre-linked object for osfmk component $(OBJROOT)/DEVELOPMENT_ARM/mach_kernel: bootable image - /* this is all you need to do to build MX31ADS arm with DEVELOPMENT kernel configuration */ + /* this is all you need to do to build H1 arm 
with DEVELOPMENT kernel configuration */ make TARGET_CONFIGS="default arm default" or the following is equivalent diff --git a/bsd/conf/MASTER.i386 b/bsd/conf/MASTER.i386 index 24125e3ce..a4504b8a8 100644 --- a/bsd/conf/MASTER.i386 +++ b/bsd/conf/MASTER.i386 @@ -55,7 +55,7 @@ # # EMBEDDED_BASE = [ intel mach bsmall vol pst gdb kernobjc fixpri simple_clock mdebug kernserv driverkit uxpr kernstack ipc_compat ipc_debug compat_43_tty sysv_sem sysv_msg sysv_shm audit panic_info config_imageboot ] # EMBEDDED_FILESYS = [ devfs hfs journaling fdesc fifo ] -# EMBEDDED_NET = [ inet compat_oldsock mrouting tcpdrop_synfin bpfilter ipdivert config_mbuf_noexpand dummynet ipfirewall ipfw2 zlib ifnet_input_chk ] +# EMBEDDED_NET = [ inet compat_oldsock mrouting tcpdrop_synfin bpfilter config_mbuf_noexpand ] # EMBEDDED = [ EMBEDDED_BASE EMBEDDED_NET VPN EMBEDDED_FILESYS libdriver no_printf_str no_kprintf_str no_kdebug ] # DEVELOPMENT = [ EMBEDDED_BASE EMBEDDED_NET NFS VPN EMBEDDED_FILESYS libdriver netmibs development mach_assert config_dtrace ] # @@ -79,7 +79,8 @@ config mach_kernel swap generic # options EVENT # # -# Note: MAC options must be set in both bsd/conf and security/conf MASTER files +# Note: MAC options must be set in all the bsd/conf, osfmk/conf, and +# security/conf MASTER files. # options CONFIG_MACF # Mandatory Access Control Framework options CONFIG_MACF_SOCKET_SUBSET # MAC socket subest (no labels) diff --git a/bsd/conf/MASTER.ppc b/bsd/conf/MASTER.ppc index 4e1513cad..9f4a08d6d 100644 --- a/bsd/conf/MASTER.ppc +++ b/bsd/conf/MASTER.ppc @@ -69,7 +69,8 @@ options UXPR # user-level XPR package # config mach_kernel swap generic # # -# Note: MAC options must be set in both bsd/conf and security/conf MASTER files +# Note: MAC options must be set in all the bsd/conf, osfmk/conf, and +# security/conf MASTER files. 
# options CONFIG_MACF # Mandatory Access Control Framework options CONFIG_MACF_SOCKET_SUBSET # MAC socket subest (no labels) diff --git a/bsd/conf/files b/bsd/conf/files index 502307c92..4f927bcba 100644 --- a/bsd/conf/files +++ b/bsd/conf/files @@ -67,7 +67,6 @@ OPTIONS/vndevice optional vndevice OPTIONS/audit optional audit OPTIONS/config_fse optional config_fse OPTIONS/sockets optional sockets -OPTIONS/kpidirect optional kpidirect OPTIONS/development optional development OPTIONS/sysv_sem optional sysv_sem OPTIONS/sysv_msg optional sysv_msg diff --git a/bsd/dev/dtrace/dtrace_glue.c b/bsd/dev/dtrace/dtrace_glue.c index 035150aa7..1ef883569 100644 --- a/bsd/dev/dtrace/dtrace_glue.c +++ b/bsd/dev/dtrace/dtrace_glue.c @@ -1218,7 +1218,16 @@ dtrace_copyinstr(user_addr_t src, uintptr_t dst, size_t len) size_t actual; if (dtrace_copycheck( src, dst, len )) { - if (copyinstr((const user_addr_t)src, (char *)dst, (vm_size_t)len, &actual)) { + /* copyin as many as 'len' bytes. */ + int error = copyinstr((const user_addr_t)src, (char *)dst, (vm_size_t)len, &actual); + + /* + * ENAMETOOLONG is returned when 'len' bytes have been copied in but the NUL terminator was + * not encountered. That does not require raising CPU_DTRACE_BADADDR, and we press on. + * Note that we do *not* stuff a NUL terminator when returning ENAMETOOLONG, that's left + * to the caller. + */ + if (error && error != ENAMETOOLONG) { DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR); cpu_core[CPU->cpu_id].cpuc_dtrace_illval = src; } @@ -1244,6 +1253,13 @@ dtrace_copyoutstr(uintptr_t src, user_addr_t dst, size_t len) size_t actual; if (dtrace_copycheck( dst, src, len )) { + + /* + * ENAMETOOLONG is returned when 'len' bytes have been copied out but the NUL terminator was + * not encountered. We raise CPU_DTRACE_BADADDR in that case. + * Note that we do *not* stuff a NUL terminator when returning ENAMETOOLONG, that's left + * to the caller. 
+ */ if (copyoutstr((const void *)src, dst, (size_t)len, &actual)) { DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR); cpu_core[CPU->cpu_id].cpuc_dtrace_illval = dst; diff --git a/bsd/dev/dtrace/lockstat.c b/bsd/dev/dtrace/lockstat.c index f466c873e..3c5602be9 100644 --- a/bsd/dev/dtrace/lockstat.c +++ b/bsd/dev/dtrace/lockstat.c @@ -77,7 +77,7 @@ typedef struct lockstat_probe { lockstat_probe_t lockstat_probes[] = { -#ifndef __PPC__ +#ifdef __i386__ /* Not implemented yet on PPC... */ { LS_LCK_MTX_LOCK, LSA_ACQUIRE, LS_LCK_MTX_LOCK_ACQUIRE, DTRACE_IDNONE }, { LS_LCK_MTX_LOCK, LSA_SPIN, LS_LCK_MTX_LOCK_SPIN, DTRACE_IDNONE }, diff --git a/bsd/dev/memdev.c b/bsd/dev/memdev.c index 307ad77d9..f957be33c 100644 --- a/bsd/dev/memdev.c +++ b/bsd/dev/memdev.c @@ -172,6 +172,7 @@ int mdevCMajor = -1; static int mdevioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p, int is_char); dev_t mdevadd(int devid, ppnum_t base, unsigned int size, int phys); dev_t mdevlookup(int devid); +void mdevremoveall(void); static int mdevclose(__unused dev_t dev, __unused int flags, __unused int devtype, __unused struct proc *p) { @@ -609,3 +610,24 @@ dev_t mdevlookup(int devid) { if(!(mdev[devid].mdFlags & mdInited)) return -1; /* This one hasn't been defined */ return mdev[devid].mdBDev; /* Return the device number */ } + +void mdevremoveall(void) { + + int i; + + for(i = 0; i < 16; i++) { + if(!(mdev[i].mdFlags & mdInited)) continue; /* Ignore unused mdevs */ + + devfs_remove(mdev[i].mdbdevb); /* Remove the block device */ + devfs_remove(mdev[i].mdcdevb); /* Remove the character device */ + + mdev[i].mdBase = 0; /* Clear the mdev's storage */ + mdev[i].mdSize = 0; + mdev[i].mdSecsize = 0; + mdev[i].mdFlags = 0; + mdev[i].mdBDev = 0; + mdev[i].mdCDev = 0; + mdev[i].mdbdevb = 0; + mdev[i].mdcdevb = 0; + } +} diff --git a/bsd/dev/random/randomdev.c b/bsd/dev/random/randomdev.c index 4a7741e2a..9208ff6b6 100644 --- a/bsd/dev/random/randomdev.c +++ b/bsd/dev/random/randomdev.c @@ -99,7 
+99,7 @@ typedef BlockWord Block[kBSize]; void add_blocks(Block a, Block b, BlockWord carry); void fips_initialize(void); -void random_block(Block b); +void random_block(Block b, int addOptional); u_int32_t CalculateCRC(u_int8_t* buffer, size_t length); /* @@ -194,18 +194,22 @@ u_int32_t CalculateCRC(u_int8_t* buffer, size_t length) * get a random block of data per fips 186-2 */ void -random_block(Block b) +random_block(Block b, int addOptional) { int repeatCount = 0; do { // do one iteration - Block xSeed; - prngOutput (gPrngRef, (BYTE*) &xSeed, sizeof (xSeed)); - // add the seed to the previous value of g_xkey - add_blocks (g_xkey, xSeed, 0); - + if (addOptional) + { + Block xSeed; + prngOutput (gPrngRef, (BYTE*) &xSeed, sizeof (xSeed)); + + // add the seed to the previous value of g_xkey + add_blocks (g_xkey, xSeed, 0); + } + // compute "G" SHA1Update (&g_sha1_ctx, (const u_int8_t *) &g_xkey, sizeof (g_xkey)); @@ -309,11 +313,13 @@ PreliminarySetup(void) fips_initialize (); } +const Block kKnownAnswer = {0x92b404e5, 0x56588ced, 0x6c1acd4e, 0xbf053f68, 0x9f73a93}; + void fips_initialize(void) { - /* Read the initial value of g_xkey from yarrow */ - prngOutput (gPrngRef, (BYTE*) &g_xkey, sizeof (g_xkey)); + /* So that we can do the self test, set the seed to zero */ + memset(&g_xkey, 0, sizeof(g_xkey)); /* initialize our SHA1 generator */ SHA1Init (&g_sha1_ctx); @@ -321,7 +327,20 @@ fips_initialize(void) /* other initializations */ memset (zeros, 0, sizeof (zeros)); g_bytes_used = 0; - random_block(g_random_data); + random_block(g_random_data, FALSE); + + // check here to see if we got the initial data we were expecting + int i; + for (i = 0; i < kBSize; ++i) + { + if (kKnownAnswer[i] != g_random_data[i]) + { + panic("FIPS random self test failed"); + } + } + + // now do the random block again to make sure that userland doesn't get predictable data + random_block(g_random_data, TRUE); } /* @@ -490,7 +509,7 @@ random_read(__unused dev_t dev, struct uio *uio, 
__unused int ioflag) int bytes_available = kBSizeInBytes - g_bytes_used; if (bytes_available == 0) { - random_block(g_random_data); + random_block(g_random_data, TRUE); g_bytes_used = 0; bytes_available = kBSizeInBytes; } @@ -533,7 +552,7 @@ read_random(void* buffer, u_int numbytes) int bytes_to_read = min(bytes_remaining, kBSizeInBytes - g_bytes_used); if (bytes_to_read == 0) { - random_block(g_random_data); + random_block(g_random_data, TRUE); g_bytes_used = 0; bytes_to_read = min(bytes_remaining, kBSizeInBytes); } diff --git a/bsd/hfs/hfs_catalog.c b/bsd/hfs/hfs_catalog.c index 0a4953cbd..b52a0cd22 100644 --- a/bsd/hfs/hfs_catalog.c +++ b/bsd/hfs/hfs_catalog.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -226,6 +226,11 @@ cat_convertattr( } } +/* + * Convert a raw catalog key and record into an in-core catalog descriptor. + * + * Note: The caller is responsible for releasing the catalog descriptor. + */ __private_extern__ int cat_convertkey( @@ -286,6 +291,9 @@ cat_releasedesc(struct cat_desc *descp) /* * cat_lookup - lookup a catalog node using a cnode decriptor + * + * Note: The caller is responsible for releasing the output + * catalog descriptor (when supplied outdescp is non-null). */ __private_extern__ int @@ -394,6 +402,10 @@ exit: * cat_findname - obtain a descriptor from cnid * * Only a thread lookup is performed. + * + * Note: The caller is responsible for releasing the output + * catalog descriptor (when supplied outdescp is non-null). + */ __private_extern__ int @@ -464,6 +476,9 @@ exit: /* * cat_idlookup - lookup a catalog node using a cnode id + * + * Note: The caller is responsible for releasing the output + * catalog descriptor (when supplied outdescp is non-null). 
*/ __private_extern__ int @@ -765,6 +780,9 @@ exit: * * NOTE: both the catalog file and attribute file locks must * be held before calling this function. + * + * The caller is responsible for releasing the output + * catalog descriptor (when supplied outdescp is non-null). */ __private_extern__ int @@ -937,6 +955,9 @@ exit: * 3. BTDeleteRecord(from_cnode); * 4. BTDeleteRecord(from_thread); * 5. BTInsertRecord(to_thread); + * + * Note: The caller is responsible for releasing the output + * catalog descriptor (when supplied out_cdp is non-null). */ __private_extern__ int @@ -1690,6 +1711,7 @@ cat_set_childlinkbit(struct hfsmount *hfsmp, cnid_t cnid) if (retval) { hfs_systemfile_unlock(hfsmp, lockflags); hfs_end_transaction(hfsmp); + cat_releasedesc(&desc); break; } @@ -1697,6 +1719,7 @@ cat_set_childlinkbit(struct hfsmount *hfsmp, cnid_t cnid) hfs_end_transaction(hfsmp); cnid = desc.cd_parentcnid; + cat_releasedesc(&desc); } return retval; diff --git a/bsd/hfs/hfs_cnode.c b/bsd/hfs/hfs_cnode.c index 1f434da3d..c09c058c6 100644 --- a/bsd/hfs/hfs_cnode.c +++ b/bsd/hfs/hfs_cnode.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2002-2007 Apple Inc. All rights reserved. + * Copyright (c) 2002-2008 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -105,6 +105,14 @@ hfs_vnop_inactive(struct vnop_inactive_args *ap) (void) hfs_lock(cp, HFS_FORCE_LOCK); + /* + * Recycle named streams quickly so that the data fork vnode can + * go inactive in a timely manner (so that it can be zero filled + * or truncated if needed). 
+ */ + if (vnode_isnamedstream(vp)) + recycle = 1; + /* * We should lock cnode before checking the flags in the * condition below and should unlock the cnode before calling @@ -219,9 +227,11 @@ hfs_vnop_inactive(struct vnop_inactive_args *ap) lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_ATTRIBUTE, HFS_EXCLUSIVE_LOCK); - if (cp->c_blocks > 0) - printf("hfs_inactive: attempting to delete a non-empty file!"); - + if (cp->c_blocks > 0) { + printf("hfs_inactive: deleting non-empty%sfile %d, " + "blks %d\n", VNODE_IS_RSRC(vp) ? " rsrc " : " ", + (int)cp->c_fileid, (int)cp->c_blocks); + } // // release the name pointer in the descriptor so that @@ -270,8 +280,15 @@ hfs_vnop_inactive(struct vnop_inactive_args *ap) hfs_volupdate(hfsmp, (v_type == VDIR) ? VOL_RMDIR : VOL_RMFILE, 0); } + /* + * A file may have had delayed allocations, in which case hfs_update + * would not have updated the catalog record (cat_update). We need + * to do that now, before we lose our fork data. We also need to + * force the update, or hfs_update will again skip the cat_update. + */ if ((cp->c_flag & C_MODIFIED) || cp->c_touch_acctime || cp->c_touch_chgtime || cp->c_touch_modtime) { + cp->c_flag |= C_FORCEUPDATE; hfs_update(vp, 0); } out: @@ -388,6 +405,35 @@ hfs_vnop_reclaim(struct vnop_reclaim_args *ap) (void) hfs_lock(VTOC(vp), HFS_FORCE_LOCK); cp = VTOC(vp); + /* + * Check if a deleted resource fork vnode missed a + * VNOP_INACTIVE call and requires truncation. + */ + if (VNODE_IS_RSRC(vp) && + (cp->c_flag & C_DELETED) && + (VTOF(vp)->ff_blocks != 0)) { + hfs_unlock(cp); + ubc_setsize(vp, 0); + + hfs_lock_truncate(cp, TRUE); + (void) hfs_lock(VTOC(vp), HFS_FORCE_LOCK); + + (void) hfs_truncate(vp, (off_t)0, IO_NDELAY, 1, ap->a_context); + + hfs_unlock_truncate(cp, TRUE); + } + /* + * A file may have had delayed allocations, in which case hfs_update + * would not have updated the catalog record (cat_update). We need + * to do that now, before we lose our fork data. 
We also need to + * force the update, or hfs_update will again skip the cat_update. + */ + if ((cp->c_flag & C_MODIFIED) || + cp->c_touch_acctime || cp->c_touch_chgtime || cp->c_touch_modtime) { + cp->c_flag |= C_FORCEUPDATE; + hfs_update(vp, 0); + } + /* * Keep track of an inactive hot file. */ @@ -742,6 +788,16 @@ hfs_getnewvnode( if (cp->c_flag & C_HARDLINK) { vnode_setmultipath(vp); } + /* + * Tag resource fork vnodes as needing an VNOP_INACTIVE + * so that any deferred removes (open unlinked files) + * have the chance to process the resource fork. + */ + if (VNODE_IS_RSRC(vp)) { + /* Force VL_NEEDINACTIVE on this vnode */ + vnode_ref(vp); + vnode_rele(vp); + } hfs_chashwakeup(cp, H_ALLOC | H_ATTACH); /* diff --git a/bsd/hfs/hfs_link.c b/bsd/hfs/hfs_link.c index 65f5e9ee8..a2e08a098 100644 --- a/bsd/hfs/hfs_link.c +++ b/bsd/hfs/hfs_link.c @@ -1059,13 +1059,14 @@ __private_extern__ void hfs_relorigin(struct cnode *cp, cnid_t parentcnid) { - linkorigin_t *origin = NULL; + linkorigin_t *origin, *prev; void * thread = current_thread(); - TAILQ_FOREACH(origin, &cp->c_originlist, lo_link) { + TAILQ_FOREACH_SAFE(origin, &cp->c_originlist, lo_link, prev) { if ((origin->lo_thread == thread) || (origin->lo_parentcnid == parentcnid)) { TAILQ_REMOVE(&cp->c_originlist, origin, lo_link); + FREE(origin, M_TEMP); break; } } diff --git a/bsd/hfs/hfs_vfsops.c b/bsd/hfs/hfs_vfsops.c index d5a05045b..c0dc7253d 100644 --- a/bsd/hfs/hfs_vfsops.c +++ b/bsd/hfs/hfs_vfsops.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1999-2007 Apple Inc. All rights reserved. + * Copyright (c) 1999-2008 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -2398,6 +2398,7 @@ hfs_vget(struct hfsmount *hfsmp, cnid_t cnid, struct vnode **vpp, int skiplock) } else if ((pid == hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid) && (bcmp(nameptr, HFS_DELETE_PREFIX, HFS_DELETE_PREFIX_LEN) == 0)) { *vpp = NULL; + cat_releasedesc(&cndesc); return (ENOENT); /* open unlinked file */ } } @@ -3313,6 +3314,12 @@ out: VTOC(vp)->c_blocks = fp->ff_blocks; } + /* + Regardless of whether or not the totalblocks actually increased, + we should reset the allocLimit field. If it changed, it will + get updated; if not, it will remain the same. + */ + hfsmp->allocLimit = vcb->totalBlocks; hfs_systemfile_unlock(hfsmp, lockflags); hfs_end_transaction(hfsmp); @@ -4026,6 +4033,7 @@ hfs_reclaim_journal_file(struct hfsmount *hfsmp, vfs_context_t context) journal_fork.cf_extents[0].blockCount = newBlockCount; journal_fork.cf_blocks = newBlockCount; error = cat_update(hfsmp, &journal_desc, &journal_attr, &journal_fork, NULL); + cat_releasedesc(&journal_desc); /* all done with cat descriptor */ if (error) { printf("hfs_reclaim_journal_file: cat_update returned %d\n", error); goto free_fail; @@ -4140,6 +4148,7 @@ hfs_reclaim_journal_info_block(struct hfsmount *hfsmp, vfs_context_t context) jib_fork.cf_extents[0].blockCount = 1; jib_fork.cf_blocks = 1; error = cat_update(hfsmp, &jib_desc, &jib_attr, &jib_fork, NULL); + cat_releasedesc(&jib_desc); /* all done with cat descriptor */ if (error) { printf("hfs_reclaim_journal_info_block: cat_update returned %d\n", error); goto fail; diff --git a/bsd/hfs/hfs_vnops.c b/bsd/hfs/hfs_vnops.c index cac1f5b75..d8350638a 100644 --- a/bsd/hfs/hfs_vnops.c +++ b/bsd/hfs/hfs_vnops.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -1728,9 +1728,10 @@ hfs_vnop_remove(ap) hfs_lock_truncate(cp, TRUE); - if ((error = hfs_lockpair(dcp, cp, HFS_EXCLUSIVE_LOCK))) - goto out; - + if ((error = hfs_lockpair(dcp, cp, HFS_EXCLUSIVE_LOCK))) { + hfs_unlock_truncate(cp, TRUE); + return (error); + } error = hfs_removefile(dvp, vp, ap->a_cnp, ap->a_flags, 0, 0); // @@ -1748,9 +1749,14 @@ hfs_vnop_remove(ap) recycle_rsrc = 1; } - hfs_unlockpair(dcp, cp); -out: + /* + * Drop the truncate lock before unlocking the cnode + * (which can potentially perform a vnode_put and + * recycle the vnode which in turn might require the + * truncate lock) + */ hfs_unlock_truncate(cp, TRUE); + hfs_unlockpair(dcp, cp); if (recycle_rsrc && vnode_getwithvid(rvp, rvid) == 0) { vnode_recycle(rvp); @@ -1798,7 +1804,7 @@ hfs_removefile(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, int lockflags; int error = 0; int started_tr = 0; - int isbigfile = 0, hasxattrs=0, isdir=0; + int isbigfile = 0, defer_remove=0, isdir=0; cp = VTOC(vp); dcp = VTOC(dvp); @@ -1866,11 +1872,22 @@ hfs_removefile(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, * (needed for hfs_truncate) */ if (isdir == 0 && (cp->c_blocks - VTOF(vp)->ff_blocks)) { - error = hfs_vgetrsrc(hfsmp, vp, &rvp, FALSE); - if (error) - goto out; - /* Defer the vnode_put on rvp until the hfs_unlock(). */ - cp->c_flag |= C_NEED_RVNODE_PUT; + /* + * We must avoid calling hfs_vgetrsrc() when we have + * an active resource fork vnode to avoid deadlocks + * when that vnode is in the VL_TERMINATE state. We + * can defer removing the file and its resource fork + * until the call to hfs_vnop_inactive() occurs. + */ + if (cp->c_rsrc_vp) { + defer_remove = 1; + } else { + error = hfs_vgetrsrc(hfsmp, vp, &rvp, FALSE); + if (error) + goto out; + /* Defer the vnode_put on rvp until the hfs_unlock(). */ + cp->c_flag |= C_NEED_RVNODE_PUT; + } } /* Check if this file is being used. 
*/ if (isdir == 0) { @@ -1887,7 +1904,7 @@ hfs_removefile(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, individual transactions in case there are too many */ if ((hfsmp->hfs_attribute_vp != NULL) && (cp->c_attr.ca_recflags & kHFSHasAttributesMask) != 0) { - hasxattrs = 1; + defer_remove = 1; } /* @@ -1976,10 +1993,10 @@ hfs_removefile(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, /* * There are two cases to consider: - * 1. File is busy/big ==> move/rename the file + * 1. File is busy/big/defer_remove ==> move/rename the file * 2. File is not in use ==> remove the file */ - if (dataforkbusy || rsrcforkbusy || isbigfile || hasxattrs) { + if (dataforkbusy || rsrcforkbusy || isbigfile || defer_remove) { char delname[32]; struct cat_desc to_desc; struct cat_desc todir_desc; @@ -3191,6 +3208,7 @@ hfs_update(struct vnode *vp, __unused int waitfor) struct cat_fork *dataforkp = NULL; struct cat_fork *rsrcforkp = NULL; struct cat_fork datafork; + struct cat_fork rsrcfork; struct hfsmount *hfsmp; int lockflags; int error; @@ -3272,6 +3290,18 @@ hfs_update(struct vnode *vp, __unused int waitfor) dataforkp = &datafork; } + /* + * For resource forks with delayed allocations, make sure + * the block count and file size match the number of blocks + * actually allocated to the file on disk. + */ + if (rsrcforkp && (cp->c_rsrcfork->ff_unallocblocks != 0)) { + bcopy(rsrcforkp, &rsrcfork, sizeof(rsrcfork)); + rsrcfork.cf_blocks = (cp->c_rsrcfork->ff_blocks - cp->c_rsrcfork->ff_unallocblocks); + rsrcfork.cf_size = rsrcfork.cf_blocks * HFSTOVCB(hfsmp)->blockSize; + rsrcforkp = &rsrcfork; + } + /* * Lock the Catalog b-tree file. 
*/ @@ -3585,6 +3615,7 @@ hfs_vgetrsrc(struct hfsmount *hfsmp, struct vnode *vp, struct vnode **rvpp, int int error; int vid; +restart: /* Attempt to use exising vnode */ if ((rvp = cp->c_rsrc_vp)) { vid = vnode_vid(rvp); @@ -3607,15 +3638,22 @@ hfs_vgetrsrc(struct hfsmount *hfsmp, struct vnode *vp, struct vnode **rvpp, int error = vnode_getwithvid(rvp, vid); - if (can_drop_lock) + if (can_drop_lock) { (void) hfs_lock(cp, HFS_FORCE_LOCK); - + /* + * When our lock was relinquished, the resource fork + * could have been recycled. Check for this and try + * again. + */ + if (error == ENOENT) + goto restart; + } if (error) { const char * name = (const char *)VTOC(vp)->c_desc.cd_nameptr; if (name) - printf("hfs_vgetrsrc: couldn't get" - " resource fork for %s\n", name); + printf("hfs_vgetrsrc: couldn't get resource" + " fork for %s, err %d\n", name, error); return (error); } } else { diff --git a/bsd/hfs/hfs_xattr.c b/bsd/hfs/hfs_xattr.c index 37dca768b..d025ae1cf 100644 --- a/bsd/hfs/hfs_xattr.c +++ b/bsd/hfs/hfs_xattr.c @@ -1504,6 +1504,8 @@ hfs_removeallattr(struct hfsmount *hfsmp, u_int32_t fileid) #endif hfs_systemfile_unlock(hfsmp, lockflags); hfs_end_transaction(hfsmp); + if (result) + break; } exit: FREE(iterator, M_TEMP); diff --git a/bsd/kern/bsd_init.c b/bsd/kern/bsd_init.c index 2ea3d6377..2a04d688c 100644 --- a/bsd/kern/bsd_init.c +++ b/bsd/kern/bsd_init.c @@ -569,6 +569,7 @@ bsd_init(void) /* * Initialize the calendar. 
*/ + bsd_init_kprintf("calling IOKitInitializeTime\n"); IOKitInitializeTime(); if (turn_on_log_leaks && !new_nkdbufs) @@ -1031,7 +1032,9 @@ parse_bsd_args(void) if (PE_parse_boot_arg("nbuf", &max_nbuf_headers)) { customnbuf = 1; } +#if !defined(SECURE_KERNEL) PE_parse_boot_arg("kmem", &setup_kmem); +#endif PE_parse_boot_arg("trace", &new_nkdbufs); if (PE_parse_boot_arg("msgbuf", &msgbuf)) { diff --git a/bsd/kern/kern_exec.c b/bsd/kern/kern_exec.c index ded4a1dcf..6b2702d7b 100644 --- a/bsd/kern/kern_exec.c +++ b/bsd/kern/kern_exec.c @@ -1548,6 +1548,7 @@ int posix_spawn(proc_t ap, struct posix_spawn_args *uap, register_t *retval) { proc_t p = ap; /* quiet bogus GCC vfork() warning */ + user_addr_t pid = uap->pid; register_t ival[2]; /* dummy retval for vfork() */ struct image_params image_params, *imgp; struct vnode_attr va; @@ -1809,8 +1810,8 @@ bad: * * If the parent wants the pid, copy it out */ - if (uap->pid != USER_ADDR_NULL) - (void)suword(uap->pid, p->p_pid); + if (pid != USER_ADDR_NULL) + (void)suword(pid, p->p_pid); retval[0] = error; /* * Override inherited code signing flags with the diff --git a/bsd/kern/kern_exit.c b/bsd/kern/kern_exit.c index 7bc8b1d74..27f98defb 100644 --- a/bsd/kern/kern_exit.c +++ b/bsd/kern/kern_exit.c @@ -246,8 +246,7 @@ exit1(proc_t p, int rv, int *retval) } sig_lock_to_exit(p); } -#if !CONFIG_EMBEDDED /* BER_XXX */ - if (p->p_pid == 1) { + if (p == initproc) { proc_unlock(p); printf("pid 1 exited (signal %d, exit %d)", WTERMSIG(rv), WEXITSTATUS(rv)); @@ -257,7 +256,6 @@ exit1(proc_t p, int rv, int *retval) "launchd"), init_task_failure_data); } -#endif p->p_lflag |= P_LEXIT; p->p_xstat = rv; diff --git a/bsd/kern/kern_sysctl.c b/bsd/kern/kern_sysctl.c index 029fddc8e..27f0e0906 100644 --- a/bsd/kern/kern_sysctl.c +++ b/bsd/kern/kern_sysctl.c @@ -2415,23 +2415,26 @@ static int sysctl_nx (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { +#ifdef SECURE_KERNEL + return ENOTSUP; 
+#endif int new_value, changed; int error; error = sysctl_io_number(req, nx_enabled, sizeof(nx_enabled), &new_value, &changed); - if (error) - return error; + if (error) + return error; - if (changed) { + if (changed) { #ifdef __i386__ /* * Only allow setting if NX is supported on the chip */ if (!(cpuid_extfeatures() & CPUID_EXTFEATURE_XD)) - return ENOTSUP; + return ENOTSUP; #endif - nx_enabled = new_value; - } + nx_enabled = new_value; + } return(error); } diff --git a/bsd/kern/kpi_socketfilter.c b/bsd/kern/kpi_socketfilter.c index 377bb2e84..cefe30473 100644 --- a/bsd/kern/kpi_socketfilter.c +++ b/bsd/kern/kpi_socketfilter.c @@ -298,10 +298,19 @@ sflt_detach_private( if (!unregistering) { if ((entry->sfe_flags & SFEF_UNREGISTERING) != 0) { /* - * Another thread is unregistering the filter, we need to - * avoid detaching the filter here so the socket won't go - * away. + * Another thread is unregistering the filter, we + * need to avoid detaching the filter here so the + * socket won't go away. Bump up the socket's + * usecount so that it won't be freed until after + * the filter unregistration has been completed; + * at this point the caller has already held the + * socket's lock, so we can directly modify the + * usecount. */ + if (!(entry->sfe_flags & SFEF_DETACHXREF)) { + entry->sfe_socket->so_usecount++; + entry->sfe_flags |= SFEF_DETACHXREF; + } lck_mtx_unlock(sock_filter_lock); return; } @@ -322,9 +331,14 @@ sflt_detach_private( else { /* * Clear the removing flag. We will perform the detach here or - * request a delayed deatch. + * request a delayed detach. Since we do an extra ref release + * below, bump up the usecount if we haven't done so. 
*/ entry->sfe_flags &= ~SFEF_UNREGISTERING; + if (!(entry->sfe_flags & SFEF_DETACHXREF)) { + entry->sfe_socket->so_usecount++; + entry->sfe_flags |= SFEF_DETACHXREF; + } } if (entry->sfe_socket->so_filteruse != 0) { @@ -510,10 +524,22 @@ sflt_unregister( filter->sf_flags |= SFF_DETACHING; for (next_entry = entry_head; next_entry; - next_entry = next_entry->sfe_next_onfilter) { - socket_lock(next_entry->sfe_socket, 1); + next_entry = next_entry->sfe_next_onfilter) { + /* + * Mark this as "unregistering"; upon dropping the + * lock, another thread may win the race and attempt + * to detach a socket from it (e.g. as part of close) + * before we get a chance to detach. Setting this + * flag practically tells the other thread to go away. + * If the other thread wins, this causes an extra + * reference hold on the socket so that it won't be + * deallocated until after we finish with the detach + * for it below. If we win the race, the extra + * reference hold is also taken to compensate for the + * extra reference release when detach is called + * with a "1" for its second parameter. 
+ */ next_entry->sfe_flags |= SFEF_UNREGISTERING; - socket_unlock(next_entry->sfe_socket, 0); /* Radar 4201550: prevents the socket from being deleted while being unregistered */ } } diff --git a/bsd/kern/pthread_synch.c b/bsd/kern/pthread_synch.c index 980c1ad89..9ccbc9a9e 100644 --- a/bsd/kern/pthread_synch.c +++ b/bsd/kern/pthread_synch.c @@ -159,7 +159,7 @@ void _pthread_start(pthread_t self, mach_port_t kport, void *(*fun)(void *), voi #define PTHREAD_START_SETSCHED 0x02000000 #define PTHREAD_START_DETACHED 0x04000000 #define PTHREAD_START_POLICY_BITSHIFT 16 -#define PTHREAD_START_POLICY_MASK 0xffff +#define PTHREAD_START_POLICY_MASK 0xff #define PTHREAD_START_IMPORTANCE_MASK 0xffff #define SCHED_OTHER POLICY_TIMESHARE @@ -958,7 +958,8 @@ bsdthread_create(__unused struct proc *p, struct bsdthread_create_args *uap, us extinfo.timeshare = 0; thread_policy_set(th, THREAD_EXTENDED_POLICY, (thread_policy_t)&extinfo, THREAD_EXTENDED_POLICY_COUNT); - precedinfo.importance = importance; +#define BASEPRI_DEFAULT 31 + precedinfo.importance = (importance - BASEPRI_DEFAULT); thread_policy_set(th, THREAD_PRECEDENCE_POLICY, (thread_policy_t)&precedinfo, THREAD_PRECEDENCE_POLICY_COUNT); } diff --git a/bsd/kern/sys_generic.c b/bsd/kern/sys_generic.c index 0fe948aae..509468087 100644 --- a/bsd/kern/sys_generic.c +++ b/bsd/kern/sys_generic.c @@ -224,7 +224,7 @@ pread_nocancel(struct proc *p, struct pread_nocancel_args *uap, user_ssize_t *re int error; if ( (error = preparefileread(p, &fp, fd, 1)) ) - return (error); + goto out; error = dofileread(vfs_context_current(), fp, uap->buf, uap->nbyte, uap->offset, FOF_OFFSET, retval); @@ -234,7 +234,8 @@ pread_nocancel(struct proc *p, struct pread_nocancel_args *uap, user_ssize_t *re if (!error) KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_pread) | DBG_FUNC_NONE), uap->fd, uap->nbyte, (unsigned int)((uap->offset >> 32)), (unsigned int)(uap->offset), 0); - + +out: return (error); } diff --git 
a/bsd/kern/uipc_socket.c b/bsd/kern/uipc_socket.c index e178c29b4..7b259ec9f 100644 --- a/bsd/kern/uipc_socket.c +++ b/bsd/kern/uipc_socket.c @@ -837,8 +837,12 @@ soclose_wait_locked(struct socket *so) mutex_held = so->so_proto->pr_domain->dom_mtx; lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED); - /* Double check here and return if there's no outstanding upcall */ - if (!(so->so_flags & SOF_UPCALLINUSE)) + /* + * Double check here and return if there's no outstanding upcall; + * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set. + */ + if (!(so->so_flags & SOF_UPCALLINUSE) || + !(so->so_flags & SOF_UPCALLCLOSEWAIT)) return; so->so_flags |= SOF_CLOSEWAIT; @@ -3195,6 +3199,19 @@ sosetopt(struct socket *so, struct sockopt *sopt) #endif /* MAC_SOCKET */ break; +#ifdef __APPLE_API_PRIVATE + case SO_UPCALLCLOSEWAIT: + error = sooptcopyin(sopt, &optval, sizeof (optval), + sizeof (optval)); + if (error) + goto bad; + if (optval) + so->so_flags |= SOF_UPCALLCLOSEWAIT; + else + so->so_flags &= ~SOF_UPCALLCLOSEWAIT; + break; +#endif + default: error = ENOPROTOOPT; break; @@ -3463,6 +3480,12 @@ integer: #endif /* MAC_SOCKET */ break; +#ifdef __APPLE_API_PRIVATE + case SO_UPCALLCLOSEWAIT: + optval = (so->so_flags & SOF_UPCALLCLOSEWAIT); + goto integer; +#endif + default: error = ENOPROTOOPT; break; diff --git a/bsd/kern/uipc_socket2.c b/bsd/kern/uipc_socket2.c index 379b9afd6..41a606ca3 100644 --- a/bsd/kern/uipc_socket2.c +++ b/bsd/kern/uipc_socket2.c @@ -843,6 +843,7 @@ sbappendrecord(struct sockbuf *sb, struct mbuf *m0) sb->sb_mb = m0; } sb->sb_lastrecord = m0; + sb->sb_mbtail = m0; m = m0->m_next; m0->m_next = 0; diff --git a/bsd/kern/uipc_syscalls.c b/bsd/kern/uipc_syscalls.c index 1126e7955..7e9cafa35 100644 --- a/bsd/kern/uipc_syscalls.c +++ b/bsd/kern/uipc_syscalls.c @@ -137,9 +137,9 @@ static int sendit(struct proc *, int, struct user_msghdr *, uio_t, int, static int recvit(struct proc *, int, struct user_msghdr *, uio_t, user_addr_t, register_t *); static 
int getsockaddr(struct socket *, struct sockaddr **, user_addr_t, - size_t); + size_t, boolean_t); static int getsockaddr_s(struct socket *, struct sockaddr_storage *, - user_addr_t, size_t); + user_addr_t, size_t, boolean_t); #if SENDFILE static void alloc_sendpkt(int, size_t, unsigned int *, struct mbuf **, boolean_t); @@ -251,9 +251,9 @@ bind(__unused proc_t p, struct bind_args *uap, __unused register_t *retval) goto out; } if (uap->namelen > sizeof (ss)) { - error = getsockaddr(so, &sa, uap->name, uap->namelen); + error = getsockaddr(so, &sa, uap->name, uap->namelen, TRUE); } else { - error = getsockaddr_s(so, &ss, uap->name, uap->namelen); + error = getsockaddr_s(so, &ss, uap->name, uap->namelen, TRUE); if (error == 0) { sa = (struct sockaddr *)&ss; want_free = FALSE; @@ -595,6 +595,7 @@ connect_nocancel(__unused proc_t p, struct connect_nocancel_args *uap, __unused boolean_t want_free = TRUE; int error; int fd = uap->s; + boolean_t dgram; AUDIT_ARG(fd, uap->s); error = file_socket(fd, &so); @@ -605,11 +606,17 @@ connect_nocancel(__unused proc_t p, struct connect_nocancel_args *uap, __unused goto out; } + /* + * Ask getsockaddr{_s} to not translate AF_UNSPEC to AF_INET + * if this is a datagram socket; translate for other types. 
+ */ + dgram = (so->so_type == SOCK_DGRAM); + /* Get socket address now before we obtain socket lock */ if (uap->namelen > sizeof (ss)) { - error = getsockaddr(so, &sa, uap->name, uap->namelen); + error = getsockaddr(so, &sa, uap->name, uap->namelen, !dgram); } else { - error = getsockaddr_s(so, &ss, uap->name, uap->namelen); + error = getsockaddr_s(so, &ss, uap->name, uap->namelen, !dgram); if (error == 0) { sa = (struct sockaddr *)&ss; want_free = FALSE; @@ -827,10 +834,10 @@ sendit(struct proc *p, int s, struct user_msghdr *mp, uio_t uiop, if (mp->msg_name != USER_ADDR_NULL) { if (mp->msg_namelen > sizeof (ss)) { error = getsockaddr(so, &to, mp->msg_name, - mp->msg_namelen); + mp->msg_namelen, TRUE); } else { error = getsockaddr_s(so, &ss, mp->msg_name, - mp->msg_namelen); + mp->msg_namelen, TRUE); if (error == 0) { to = (struct sockaddr *)&ss; want_free = FALSE; @@ -1840,7 +1847,7 @@ sockargs(struct mbuf **mp, user_addr_t data, int buflen, int type) */ static int getsockaddr(struct socket *so, struct sockaddr **namp, user_addr_t uaddr, - size_t len) + size_t len, boolean_t translate_unspec) { struct sockaddr *sa; int error; @@ -1865,7 +1872,7 @@ getsockaddr(struct socket *so, struct sockaddr **namp, user_addr_t uaddr, * sockets we leave it unchanged and let the lower layer * handle it. */ - if (sa->sa_family == AF_UNSPEC && + if (translate_unspec && sa->sa_family == AF_UNSPEC && INP_CHECK_SOCKAF(so, AF_INET) && len == sizeof (struct sockaddr_in)) sa->sa_family = AF_INET; @@ -1878,7 +1885,7 @@ getsockaddr(struct socket *so, struct sockaddr **namp, user_addr_t uaddr, static int getsockaddr_s(struct socket *so, struct sockaddr_storage *ss, - user_addr_t uaddr, size_t len) + user_addr_t uaddr, size_t len, boolean_t translate_unspec) { int error; @@ -1902,7 +1909,7 @@ getsockaddr_s(struct socket *so, struct sockaddr_storage *ss, * sockets we leave it unchanged and let the lower layer * handle it. 
*/ - if (ss->ss_family == AF_UNSPEC && + if (translate_unspec && ss->ss_family == AF_UNSPEC && INP_CHECK_SOCKAF(so, AF_INET) && len == sizeof (struct sockaddr_in)) ss->ss_family = AF_INET; diff --git a/bsd/net/dlil.c b/bsd/net/dlil.c index 47690269a..e3b16f486 100644 --- a/bsd/net/dlil.c +++ b/bsd/net/dlil.c @@ -310,6 +310,11 @@ dlil_write_end(void) static int proto_hash_value(u_long protocol_family) { + /* + * dlil_proto_unplumb_all() depends on the mapping between + * the hash bucket index and the protocol family defined + * here; future changes must be applied there as well. + */ switch(protocol_family) { case PF_INET: return 0; @@ -2852,3 +2857,24 @@ dlil_if_release( ifnet_lock_done(ifp); } + +__private_extern__ void +dlil_proto_unplumb_all(struct ifnet *ifp) +{ + /* + * if_proto_hash[0-3] are for PF_INET, PF_INET6, PF_APPLETALK + * and PF_VLAN, where each bucket contains exactly one entry; + * PF_VLAN does not need an explicit unplumb. + * + * if_proto_hash[4] is for other protocols; we expect anything + * in this bucket to respond to the DETACHING event (which would + * have happened by now) and do the unplumb then. 
+ */ + (void) proto_unplumb(PF_INET, ifp); +#if INET6 + (void) proto_unplumb(PF_INET6, ifp); +#endif /* INET6 */ +#if NETAT + (void) proto_unplumb(PF_APPLETALK, ifp); +#endif /* NETAT */ +} diff --git a/bsd/net/dlil.h b/bsd/net/dlil.h index 3f19f7108..6e3872b79 100644 --- a/bsd/net/dlil.h +++ b/bsd/net/dlil.h @@ -161,6 +161,7 @@ int dlil_attach_filter(ifnet_t ifp, const struct iff_filter *if_filter, interface_filter_t *filter_ref); void dlil_detach_filter(interface_filter_t filter); int dlil_detach_protocol(ifnet_t ifp, u_long protocol); +extern void dlil_proto_unplumb_all(ifnet_t); #endif /* BSD_KERNEL_PRIVATE */ diff --git a/bsd/net/if.c b/bsd/net/if.c index 04b3cadf6..499b4790c 100644 --- a/bsd/net/if.c +++ b/bsd/net/if.c @@ -2048,13 +2048,14 @@ if_down_all(void) u_int32_t count; u_int32_t i; - if (ifnet_list_get(IFNET_FAMILY_ANY, &ifp, &count) != 0) { + if (ifnet_list_get_all(IFNET_FAMILY_ANY, &ifp, &count) == 0) { for (i = 0; i < count; i++) { if_down(ifp[i]); + dlil_proto_unplumb_all(ifp[i]); } ifnet_list_free(ifp); } - + return 0; } diff --git a/bsd/net/kext_net.h b/bsd/net/kext_net.h index b7b98dd00..6215515a3 100644 --- a/bsd/net/kext_net.h +++ b/bsd/net/kext_net.h @@ -48,8 +48,9 @@ struct socket_filter; -#define SFEF_DETACHUSEZERO 0x1 // Detach when use reaches zero -#define SFEF_UNREGISTERING 0x2 // Remove due to unregister +#define SFEF_DETACHUSEZERO 0x1 /* Detach when use reaches zero */ +#define SFEF_UNREGISTERING 0x2 /* Remove due to unregister */ +#define SFEF_DETACHXREF 0x4 /* Extra reference held for detach */ struct socket_filter_entry { struct socket_filter_entry *sfe_next_onsocket; diff --git a/bsd/net/kpi_interface.c b/bsd/net/kpi_interface.c index 1878cde46..d9dfca3f3 100644 --- a/bsd/net/kpi_interface.c +++ b/bsd/net/kpi_interface.c @@ -56,6 +56,9 @@ extern struct dlil_threading_info *dlil_lo_thread_ptr; extern int dlil_multithreaded_input; +static errno_t +ifnet_list_get_common(ifnet_family_t, boolean_t, ifnet_t **, u_int32_t *); + /* 
Temporary work around until we have real reference counting @@ -1084,42 +1087,55 @@ ifnet_find_by_name( } errno_t -ifnet_list_get( - ifnet_family_t family, - ifnet_t **list, - u_int32_t *count) +ifnet_list_get(ifnet_family_t family, ifnet_t **list, u_int32_t *count) +{ + return (ifnet_list_get_common(family, FALSE, list, count)); +} + +__private_extern__ errno_t +ifnet_list_get_all(ifnet_family_t family, ifnet_t **list, u_int32_t *count) +{ + return (ifnet_list_get_common(family, TRUE, list, count)); +} + +static errno_t +ifnet_list_get_common(ifnet_family_t family, boolean_t get_all, ifnet_t **list, + u_int32_t *count) { struct ifnet *ifp; u_int32_t cmax = 0; *count = 0; errno_t result = 0; - - if (list == NULL || count == NULL) return EINVAL; - + + if (list == NULL || count == NULL) + return (EINVAL); + ifnet_head_lock_shared(); - TAILQ_FOREACH(ifp, &ifnet, if_link) - { - if (ifp->if_eflags & IFEF_DETACHING) continue; - if (family == 0 || ifp->if_family == family) + TAILQ_FOREACH(ifp, &ifnet, if_link) { + if ((ifp->if_eflags & IFEF_DETACHING) && !get_all) + continue; + if (family == IFNET_FAMILY_ANY || ifp->if_family == family) cmax++; } - + if (cmax == 0) result = ENXIO; - + if (result == 0) { - MALLOC(*list, ifnet_t*, sizeof(ifnet_t) * (cmax + 1), M_TEMP, M_NOWAIT); + MALLOC(*list, ifnet_t*, sizeof(ifnet_t) * (cmax + 1), + M_TEMP, M_NOWAIT); if (*list == NULL) result = ENOMEM; } if (result == 0) { - TAILQ_FOREACH(ifp, &ifnet, if_link) - { - if (ifp->if_eflags & IFEF_DETACHING) continue; - if (*count + 1 > cmax) break; - if (family == 0 || ((ifnet_family_t)ifp->if_family) == family) - { + TAILQ_FOREACH(ifp, &ifnet, if_link) { + if ((ifp->if_eflags & IFEF_DETACHING) && !get_all) + continue; + if (*count + 1 > cmax) + break; + if (family == IFNET_FAMILY_ANY || + ((ifnet_family_t)ifp->if_family) == family) { (*list)[*count] = (ifnet_t)ifp; ifnet_reference((*list)[*count]); (*count)++; @@ -1128,23 +1144,22 @@ ifnet_list_get( (*list)[*count] = NULL; } 
ifnet_head_done(); - - return 0; + + return (result); } void -ifnet_list_free( - ifnet_t *interfaces) +ifnet_list_free(ifnet_t *interfaces) { int i; - - if (interfaces == NULL) return; - - for (i = 0; interfaces[i]; i++) - { + + if (interfaces == NULL) + return; + + for (i = 0; interfaces[i]; i++) { ifnet_release(interfaces[i]); } - + FREE(interfaces, M_TEMP); } diff --git a/bsd/net/kpi_interface.h b/bsd/net/kpi_interface.h index 8a0cd2b7c..dd3101b4a 100644 --- a/bsd/net/kpi_interface.h +++ b/bsd/net/kpi_interface.h @@ -1505,6 +1505,25 @@ errno_t ifnet_find_by_name(const char *ifname, ifnet_t *interface); */ errno_t ifnet_list_get(ifnet_family_t family, ifnet_t **interfaces, u_int32_t *count); +#ifdef KERNEL_PRIVATE +/*! + @function ifnet_list_get_all + @discussion Get a list of attached interfaces. List will be set to + point to an array allocated by ifnet_list_get. The interfaces + are refcounted and the counts will be incremented before the + function returns. The list of interfaces must be freed using + ifnet_list_free. This is similar to ifnet_list_get, except + that it includes interfaces that are detaching. + @param family The interface family (i.e. IFNET_FAMILY_ETHERNET). To + find interfaces of all families, use IFNET_FAMILY_ANY. + @param interfaces A pointer to an array of interface references. + @param count A pointer that will be filled in with the number of + matching interfaces in the array. + @result 0 on success otherwise the errno error. + */ +errno_t ifnet_list_get_all(ifnet_family_t family, ifnet_t **interfaces, u_int32_t *count); +#endif /* KERNEL_PRIVATE */ + /*! @function ifnet_list_free @discussion Free a list of interfaces returned by ifnet_list_get. 
diff --git a/bsd/net/kpi_protocol.c b/bsd/net/kpi_protocol.c index 8b3614a49..9b63ec840 100644 --- a/bsd/net/kpi_protocol.c +++ b/bsd/net/kpi_protocol.c @@ -266,6 +266,7 @@ proto_input_run(void) } } if (locked) { + locked = 0; lck_mtx_unlock(entry->domain->dom_mtx); } } diff --git a/bsd/net/route.c b/bsd/net/route.c index ffd62033f..e00ce3eaa 100644 --- a/bsd/net/route.c +++ b/bsd/net/route.c @@ -382,7 +382,7 @@ rtfree_locked(struct rtentry *rt) * close routine typically issues RTM_DELETE which clears the RTF_UP * flag on the entry so that the code below reclaims the storage. */ - if (rnh->rnh_close && rt->rt_refcnt == 0) + if (rnh && rnh->rnh_close && rt->rt_refcnt == 0) rnh->rnh_close((struct radix_node *)rt, rnh); /* diff --git a/bsd/netinet/igmp.c b/bsd/netinet/igmp.c index 961549d60..1889c7125 100644 --- a/bsd/netinet/igmp.c +++ b/bsd/netinet/igmp.c @@ -110,7 +110,7 @@ static MALLOC_DEFINE(M_IGMP, "igmp", "igmp state"); #endif static struct router_info * - find_rti(struct ifnet *ifp); + find_rti(struct ifnet *ifp, int wait); static struct igmpstat igmpstat; @@ -155,7 +155,7 @@ igmp_init(void) static struct router_info * find_rti( - struct ifnet *ifp) + struct ifnet *ifp, int wait) { struct router_info *rti = Head; @@ -173,7 +173,7 @@ find_rti( rti = rti->rti_next; } - MALLOC(rti, struct router_info *, sizeof *rti, M_IGMP, M_NOWAIT); + MALLOC(rti, struct router_info *, sizeof *rti, M_IGMP, wait); if (rti != NULL) { rti->rti_ifp = ifp; @@ -243,7 +243,7 @@ igmp_input( timer = igmp->igmp_code * PR_FASTHZ / IGMP_TIMER_SCALE; if (timer == 0) timer = 1; - rti = find_rti(ifp); + rti = find_rti(ifp, M_NOWAIT); if (rti == NULL) { m_freem(m); return; @@ -398,7 +398,7 @@ igmp_joingroup(struct in_multi *inm) inm->inm_timer = 0; inm->inm_state = IGMP_OTHERMEMBER; } else { - inm->inm_rti = find_rti(inm->inm_ifp); + inm->inm_rti = find_rti(inm->inm_ifp, M_WAITOK); if (inm->inm_rti == NULL) return ENOMEM; igmp_sendpkt(inm, inm->inm_rti->rti_type, 0); inm->inm_timer = 
IGMP_RANDOM_DELAY( @@ -438,7 +438,7 @@ igmp_fasttimo(void) while (inm != NULL) { if (inm->inm_timer == 0) { /* do nothing */ - } else if (--inm->inm_timer == 0) { + } else if ((--inm->inm_timer == 0) && (inm->inm_rti != NULL)) { igmp_sendpkt(inm, inm->inm_rti->rti_type, 0); inm->inm_state = IGMP_IREPORTEDLAST; } else { diff --git a/bsd/netinet/in.h b/bsd/netinet/in.h index 8ed004c82..7f23a9e6a 100644 --- a/bsd/netinet/in.h +++ b/bsd/netinet/in.h @@ -440,7 +440,7 @@ struct ip_opts { #define IP_TRAFFIC_MGT_BACKGROUND 65 /* int*; get background IO flags; set background IO */ -#if CONFIG_FORCE_OUT_IFP +#ifdef PRIVATE /* This is a hack, this is only a hack. */ #define IP_FORCE_OUT_IFP 69 /* char ifname[] - send traffic on this interface */ #endif diff --git a/bsd/netinet/ip_fw2.h b/bsd/netinet/ip_fw2.h index cd1514ffd..1e36b65a9 100644 --- a/bsd/netinet/ip_fw2.h +++ b/bsd/netinet/ip_fw2.h @@ -432,6 +432,7 @@ struct _ipfw_dyn_rule { * Main firewall chains definitions and global var's definitions. */ #ifdef KERNEL +#if IPFIREWALL #define IP_FW_PORT_DYNT_FLAG 0x10000 #define IP_FW_PORT_TEE_FLAG 0x20000 @@ -457,6 +458,7 @@ struct ip_fw_args { u_int16_t divert_rule; /* divert cookie */ u_int32_t retval; }; +//struct ip_fw_args; /* * Function definitions. 
@@ -476,6 +478,7 @@ extern ip_fw_ctl_t *ip_fw_ctl_ptr; extern int fw_one_pass; extern int fw_enable; #define IPFW_LOADED (ip_fw_chk_ptr != NULL) +#endif /* IPFIREWALL */ #endif /* KERNEL */ #endif /* !__LP64__ */ diff --git a/bsd/netinet/ip_input.c b/bsd/netinet/ip_input.c index 225164fd6..8743d9178 100644 --- a/bsd/netinet/ip_input.c +++ b/bsd/netinet/ip_input.c @@ -258,6 +258,7 @@ SYSCTL_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_RW, /* Firewall hooks */ +#if IPFIREWALL ip_fw_chk_t *ip_fw_chk_ptr; int fw_enable = 1; int fw_bypass = 1; @@ -268,6 +269,7 @@ ip_dn_io_t *ip_dn_io_ptr; #endif int (*fr_checkp)(struct ip *, int, struct ifnet *, int, struct mbuf **) = NULL; +#endif /* IPFIREWALL */ SYSCTL_NODE(_net_inet_ip, OID_AUTO, linklocal, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "link local"); @@ -531,7 +533,9 @@ ip_input(struct mbuf *m) u_short sum; struct in_addr pkt_dst; u_int32_t div_info = 0; /* packet divert/tee info */ +#if IPFIREWALL struct ip_fw_args args; +#endif ipfilter_t inject_filter_ref = 0; struct m_tag *tag; struct route ipforward_rt; @@ -557,6 +561,7 @@ ip_input(struct mbuf *m) } #endif /* DUMMYNET */ +#if IPDIVERT if ((tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_DIVERT, NULL)) != NULL) { struct divert_tag *div_tag; @@ -565,6 +570,8 @@ ip_input(struct mbuf *m) m_tag_delete(m, tag); } +#endif + if ((tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_IPFORWARD, NULL)) != NULL) { struct ip_fwd_tag *ipfwd_tag; @@ -815,7 +822,11 @@ pass: * to be sent and the original packet to be freed). */ ip_nhops = 0; /* for source routed packets */ +#if IPFIREWALL if (hlen > sizeof (struct ip) && ip_dooptions(m, 0, args.next_hop, &ipforward_rt)) { +#else + if (hlen > sizeof (struct ip) && ip_dooptions(m, 0, NULL, &ipforward_rt)) { +#endif return; } @@ -842,8 +853,12 @@ pass: * Cache the destination address of the packet; this may be * changed by use of 'ipfw fwd'. */ +#if IPFIREWALL pkt_dst = args.next_hop == NULL ? 
ip->ip_dst : args.next_hop->sin_addr; +#else + pkt_dst = ip->ip_dst; +#endif /* * Enable a consistency check between the destination address @@ -860,8 +875,12 @@ pass: * the packets are received. */ checkif = ip_checkinterface && (ipforwarding == 0) && - ((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) == 0) && - (args.next_hop == NULL); + ((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) == 0) +#if IPFIREWALL + && (args.next_hop == NULL); +#else + ; +#endif lck_mtx_lock(rt_mtx); TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) { @@ -989,7 +1008,11 @@ pass: OSAddAtomic(1, (SInt32*)&ipstat.ips_cantforward); m_freem(m); } else { +#if IPFIREWALL ip_forward(m, 0, args.next_hop, &ipforward_rt); +#else + ip_forward(m, 0, NULL, &ipforward_rt); +#endif if (ipforward_rt.ro_rt != NULL) { rtfree(ipforward_rt.ro_rt); ipforward_rt.ro_rt = NULL; @@ -1184,6 +1207,7 @@ found: */ OSAddAtomic(1, (SInt32*)&ipstat.ips_delivered); { +#if IPFIREWALL if (args.next_hop && ip->ip_p == IPPROTO_TCP) { /* TCP needs IPFORWARD info if available */ struct m_tag *fwd_tag; @@ -1212,6 +1236,9 @@ found: ip_proto_dispatch_in(m, hlen, ip->ip_p, 0); } +#else + ip_proto_dispatch_in(m, hlen, ip->ip_p, 0); +#endif return; } diff --git a/bsd/netinet/ip_output.c b/bsd/netinet/ip_output.c index c065797e7..db39fe174 100644 --- a/bsd/netinet/ip_output.c +++ b/bsd/netinet/ip_output.c @@ -249,7 +249,9 @@ ip_output_list( #if IPFIREWALL_FORWARD int fwd_rewrite_src = 0; #endif +#if IPFIREWALL struct ip_fw_args args; +#endif int didfilter = 0; ipfilter_t inject_filter_ref = 0; struct m_tag *tag; @@ -261,8 +263,8 @@ ip_output_list( KERNEL_DEBUG(DBG_FNC_IP_OUTPUT | DBG_FUNC_START, 0,0,0,0,0); packetlist = m0; - args.next_hop = NULL; #if IPFIREWALL + args.next_hop = NULL; args.eh = NULL; args.rule = NULL; args.divert_rule = 0; /* divert cookie */ @@ -297,7 +299,6 @@ ip_output_list( m_tag_delete(m0, tag); } #endif /* IPDIVERT */ -#endif /* IPFIREWALL */ if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_IPFORWARD, 
NULL)) != NULL) { struct ip_fwd_tag *ipfwd_tag; @@ -307,6 +308,7 @@ ip_output_list( m_tag_delete(m0, tag); } +#endif /* IPFIREWALL */ m = m0; @@ -356,7 +358,11 @@ loopit: hlen = len; } ip = mtod(m, struct ip *); +#if IPFIREWALL pkt_dst = args.next_hop ? args.next_hop->sin_addr : ip->ip_dst; +#else + pkt_dst = ip->ip_dst; +#endif /* * Fill in IP header. diff --git a/bsd/netinet/raw_ip.c b/bsd/netinet/raw_ip.c index df763aac5..e30687513 100644 --- a/bsd/netinet/raw_ip.c +++ b/bsd/netinet/raw_ip.c @@ -125,10 +125,12 @@ struct inpcbhead ripcb; struct inpcbinfo ripcbinfo; /* control hooks for ipfw and dummynet */ +#if IPFIREWALL ip_fw_ctl_t *ip_fw_ctl_ptr; #if DUMMYNET ip_dn_ctl_t *ip_dn_ctl_ptr; #endif /* DUMMYNET */ +#endif /* IPFIREWALL */ /* * Nominal space allocated to a raw ip socket. diff --git a/bsd/netinet/tcp_input.c b/bsd/netinet/tcp_input.c index 39a5fc252..302ab9431 100644 --- a/bsd/netinet/tcp_input.c +++ b/bsd/netinet/tcp_input.c @@ -1462,13 +1462,6 @@ findpcb: * Grow the congestion window, if the * connection is cwnd bound. */ - if (tp->snd_cwnd < tp->snd_wnd) { - tp->t_bytes_acked += acked; - if (tp->t_bytes_acked > tp->snd_cwnd) { - tp->t_bytes_acked -= tp->snd_cwnd; - tp->snd_cwnd += tp->t_maxseg; - } - } sbdrop(&so->so_snd, acked); if (SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) @@ -1794,7 +1787,6 @@ findpcb: tp->ecn_flags &= ~TE_SENDIPECT; } - soisconnected(so); #if CONFIG_MACF_NET && CONFIG_MACF_SOCKET /* XXXMAC: recursive lock: SOCK_LOCK(so); */ mac_socketpeer_label_associate_mbuf(m, so); @@ -1835,6 +1827,10 @@ findpcb: tp->t_state = TCPS_ESTABLISHED; tp->t_timer[TCPT_KEEP] = TCP_KEEPIDLE(tp); } + /* soisconnected may lead to socket_unlock in case of upcalls, + * make sure this is done when everything is setup. 
+ */ + soisconnected(so); } else { /* * Received initial SYN in SYN-SENT[*] state => simul- @@ -2223,7 +2219,6 @@ trimthenstep6: case TCPS_SYN_RECEIVED: tcpstat.tcps_connects++; - soisconnected(so); /* Do window scaling? */ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == @@ -2252,8 +2247,14 @@ trimthenstep6: (void) tcp_reass(tp, (struct tcphdr *)0, &tlen, (struct mbuf *)0); tp->snd_wl1 = th->th_seq - 1; + /* FALLTHROUGH */ + /* soisconnected may lead to socket_unlock in case of upcalls, + * make sure this is done when everything is setup. + */ + soisconnected(so); + /* * In ESTABLISHED state: drop duplicate ACKs; ACK out of range * ACKs. If the ack is in the range @@ -2542,30 +2543,45 @@ process_ACK: register u_int cw = tp->snd_cwnd; register u_int incr = tp->t_maxseg; - if (cw >= tp->snd_ssthresh) { - tp->t_bytes_acked += acked; - if (tp->t_bytes_acked >= cw) { + if ((acked > incr) && tcp_do_rfc3465) { + if (cw >= tp->snd_ssthresh) { + tp->t_bytes_acked += acked; + if (tp->t_bytes_acked >= cw) { /* Time to increase the window. */ - tp->t_bytes_acked -= cw; - } else { + tp->t_bytes_acked -= cw; + } else { /* No need to increase yet. */ - incr = 0; + incr = 0; + } + } else { + /* + * If the user explicitly enables RFC3465 + * use 2*SMSS for the "L" param. Otherwise + * use the more conservative 1*SMSS. + * + * (See RFC 3465 2.3 Choosing the Limit) + */ + u_int abc_lim; + + abc_lim = (tcp_do_rfc3465 == 0) ? + incr : incr * 2; + incr = lmin(acked, abc_lim); } - } else { + } + else { /* - * If the user explicitly enables RFC3465 - * use 2*SMSS for the "L" param. Otherwise - * use the more conservative 1*SMSS. - * - * (See RFC 3465 2.3 Choosing the Limit) + * If the window gives us less than ssthresh packets + * in flight, open exponentially (segsz per packet). + * Otherwise open linearly: segsz per window + * (segsz^2 / cwnd per packet). */ - u_int abc_lim; - - abc_lim = (tcp_do_rfc3465 == 0) ? 
- incr : incr * 2; - incr = min(acked, abc_lim); + + if (cw >= tp->snd_ssthresh) { + incr = incr * incr / cw; + } } + tp->snd_cwnd = min(cw+incr, TCP_MAXWIN<<tp->snd_scale); } if (acked > so->so_snd.sb_cc) { @@ -2577,7 +2593,6 @@ process_ACK: tp->snd_wnd -= acked; ourfinisacked = 0; } - sowwakeup(so); /* detect una wraparound */ if ((tcp_do_newreno || tp->sack_enable) && !IN_FASTRECOVERY(tp) && @@ -2595,6 +2610,12 @@ process_ACK: } if (SEQ_LT(tp->snd_nxt, tp->snd_una)) tp->snd_nxt = tp->snd_una; + + /* + * sowwakeup must happen after snd_una, et al. are updated so that + * the sequence numbers are in sync with so_snd + */ + sowwakeup(so); switch (tp->t_state) { @@ -2613,9 +2634,9 @@ process_ACK: * we'll hang forever. */ if (so->so_state & SS_CANTRCVMORE) { - soisdisconnected(so); tp->t_timer[TCPT_2MSL] = tcp_maxidle; add_to_time_wait(tp); + soisdisconnected(so); } tp->t_state = TCPS_FIN_WAIT_2; goto drop; diff --git a/bsd/netinet/tcp_output.c b/bsd/netinet/tcp_output.c index 250d4a2d6..db82d4d90 100644 --- a/bsd/netinet/tcp_output.c +++ b/bsd/netinet/tcp_output.c @@ -167,8 +167,10 @@ extern int ipsec_bypass; extern int slowlink_wsize; /* window correction for slow links */ extern u_long route_generation; +#if IPFIREWALL extern int fw_enable; /* firewall check for packet chaining */ extern int fw_bypass; /* firewall check: disable packet chaining if there is rules */ +#endif /* IPFIREWALL */ extern vm_size_t so_cache_zone_element_size; @@ -677,10 +679,19 @@ after_sack_rexmit: long adv = lmin(recwin, (long)TCP_MAXWIN << tp->rcv_scale) - (tp->rcv_adv - tp->rcv_nxt); - if (adv >= (long) (2 * tp->t_maxseg)) - goto send; - if (2 * adv >= (long) so->so_rcv.sb_hiwat) - goto send; + if (adv >= (long) (2 * tp->t_maxseg)) { + + /* + * Update only if the resulting scaled value of the window changed, or + * if there is a change in the sequence since the last ack.
+ * This avoids what appears as dupe ACKS (see rdar://5640997) + */ + + if ((tp->last_ack_sent != tp->rcv_nxt) || (((recwin + adv) >> tp->rcv_scale) > recwin)) + goto send; + } + if (2 * adv >= (long) so->so_rcv.sb_hiwat) + goto send; } /* @@ -1239,6 +1250,8 @@ send: tp->sackhint.sack_bytes_rexmit += len; } th->th_ack = htonl(tp->rcv_nxt); + tp->last_ack_sent = tp->rcv_nxt; + if (optlen) { bcopy(opt, th + 1, optlen); th->th_off = (sizeof (struct tcphdr) + optlen) >> 2; @@ -1623,6 +1636,11 @@ tcp_ip_output(struct socket *so, struct tcpcb *tp, struct mbuf *pkt, boolean_t chain; boolean_t unlocked = FALSE; + /* Make sure ACK/DELACK conditions are cleared before + * we unlock the socket. + */ + + tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); /* * If allowed, unlock TCP socket while in IP * but only if the connection is established and @@ -1642,11 +1660,15 @@ tcp_ip_output(struct socket *so, struct tcpcb *tp, struct mbuf *pkt, * - there is a non default rule set for the firewall */ - chain = tcp_packet_chaining > 1 && + chain = tcp_packet_chaining > 1 #if IPSEC - ipsec_bypass && + && ipsec_bypass +#endif +#if IPFIREWALL + && (fw_enable == 0 || fw_bypass) #endif - (fw_enable == 0 || fw_bypass); + ; // I'm important, not extraneous + while (pkt != NULL) { struct mbuf *npkt = pkt->m_nextpkt; diff --git a/bsd/netinet/tcp_timer.c b/bsd/netinet/tcp_timer.c index 739e2816b..833caaf4c 100644 --- a/bsd/netinet/tcp_timer.c +++ b/bsd/netinet/tcp_timer.c @@ -358,7 +358,7 @@ static int bg_cnt = 0; void tcp_slowtimo() { - struct inpcb *inp; + struct inpcb *inp, *nxt; struct tcpcb *tp; struct socket *so; int i; @@ -537,12 +537,12 @@ twunlock: } - LIST_FOREACH(inp, &tcb, inp_list) { + LIST_FOREACH_SAFE(inp, &tcb, inp_list, nxt) { tcp_garbage_collect(inp, 0); } /* Now cleanup the time wait ones */ - LIST_FOREACH(inp, &time_wait_slots[cur_tw_slot], inp_list) { + LIST_FOREACH_SAFE(inp, &time_wait_slots[cur_tw_slot], inp_list, nxt) { tcp_garbage_collect(inp, 1); } diff --git 
a/bsd/netinet6/nd6.c b/bsd/netinet6/nd6.c index 1bee938d4..f0c838d58 100644 --- a/bsd/netinet6/nd6.c +++ b/bsd/netinet6/nd6.c @@ -104,7 +104,7 @@ int nd6_debug = 0; static int nd6_inuse, nd6_allocated; struct llinfo_nd6 llinfo_nd6 = {&llinfo_nd6, &llinfo_nd6, NULL, NULL, 0, 0, 0, 0, 0 }; -size_t nd_ifinfo_indexlim = 8; +size_t nd_ifinfo_indexlim = 32; /* increased for 5589193 */ struct nd_ifinfo *nd_ifinfo = NULL; struct nd_drhead nd_defrouter; struct nd_prhead nd_prefix = { 0 }; @@ -166,7 +166,13 @@ nd6_ifattach( bzero(q, n); if (nd_ifinfo) { bcopy((caddr_t)nd_ifinfo, q, n/2); + /* Radar 5589193: + * SU fix purposely leaks the old nd_ifinfo array + * if we grow the array to more than 32 interfaces + * Fix for future release is to use proper locking. + FREE((caddr_t)nd_ifinfo, M_IP6NDP); + */ } nd_ifinfo = (struct nd_ifinfo *)q; } diff --git a/bsd/nfs/nfs_socket.c b/bsd/nfs/nfs_socket.c index fdb3ae1c5..48694c9ee 100644 --- a/bsd/nfs/nfs_socket.c +++ b/bsd/nfs/nfs_socket.c @@ -318,6 +318,8 @@ nfs_connect(struct nfsmount *nmp) lck_mtx_unlock(&nmp->nm_lock); goto bad; } + /* just playin' it safe */ + sock_setsockopt(so, SOL_SOCKET, SO_UPCALLCLOSEWAIT, &on, sizeof(on)); if (!(nmp->nm_flag & NFSMNT_INT)) sock_nointerrupt(so, 1); diff --git a/bsd/nfs/nfs_syscalls.c b/bsd/nfs/nfs_syscalls.c index 1a4f5bb0e..9d4d4f309 100644 --- a/bsd/nfs/nfs_syscalls.c +++ b/bsd/nfs/nfs_syscalls.c @@ -781,6 +781,8 @@ nfssvc_addsock(socket_t so, mbuf_t mynam) so->so_upcall = nfsrv_rcv; so->so_rcv.sb_flags |= SB_UPCALL; socket_unlock(so, 1); + /* just playin' it safe */ + sock_setsockopt(so, SOL_SOCKET, SO_UPCALLCLOSEWAIT, &on, sizeof(on)); /* mark that the socket is not in the nfsrv_sockwg list */ slp->ns_wgq.tqe_next = SLPNOLIST; diff --git a/bsd/nfs/nfsm_subs.h b/bsd/nfs/nfsm_subs.h index 3d6d36b65..2daeebc9c 100644 --- a/bsd/nfs/nfsm_subs.h +++ b/bsd/nfs/nfsm_subs.h @@ -513,11 +513,13 @@ int nfsm_chain_trim_data(struct nfsm_chain *, int, int *); /* get a pointer to the next consecutive
bytes in an mbuf chain */ #define nfsm_chain_get_opaque_pointer(E, NMC, LEN, PTR) \ do { \ + uint32_t rndlen; \ if (E) break; \ - if ((NMC)->nmc_left >= (uint32_t)(LEN)) { \ + rndlen = nfsm_rndup(LEN); \ + if ((NMC)->nmc_left >= rndlen) { \ (PTR) = (void*)(NMC)->nmc_ptr; \ - (NMC)->nmc_left -= nfsm_rndup(LEN); \ - (NMC)->nmc_ptr += nfsm_rndup(LEN); \ + (NMC)->nmc_left -= rndlen; \ + (NMC)->nmc_ptr += rndlen; \ } else { \ (E) = nfsm_chain_get_opaque_pointer_f((NMC), (LEN), (u_char**)&(PTR)); \ } \ @@ -526,11 +528,13 @@ int nfsm_chain_trim_data(struct nfsm_chain *, int, int *); /* copy the next consecutive bytes of opaque data from an mbuf chain */ #define nfsm_chain_get_opaque(E, NMC, LEN, PTR) \ do { \ + uint32_t rndlen; \ if (E) break; \ - if ((NMC)->nmc_left >= (LEN)) { \ + rndlen = nfsm_rndup(LEN); \ + if ((NMC)->nmc_left >= rndlen) { \ u_char *__tmpptr = (u_char*)(NMC)->nmc_ptr; \ - (NMC)->nmc_left -= nfsm_rndup(LEN); \ - (NMC)->nmc_ptr += nfsm_rndup(LEN); \ + (NMC)->nmc_left -= rndlen; \ + (NMC)->nmc_ptr += rndlen; \ bcopy(__tmpptr, (PTR), (LEN)); \ } else { \ (E) = nfsm_chain_get_opaque_f((NMC), (LEN), (u_char*)(PTR)); \ diff --git a/bsd/sys/aio.h b/bsd/sys/aio.h index 938583373..bb0a7d7c5 100644 --- a/bsd/sys/aio.h +++ b/bsd/sys/aio.h @@ -75,7 +75,7 @@ struct aiocb { struct user_aiocb { int aio_fildes; /* File descriptor */ - off_t aio_offset; /* File offset */ + off_t aio_offset __attribute((aligned(8))); /* File offset */ user_addr_t aio_buf __attribute((aligned(8))); /* Location of buffer */ user_size_t aio_nbytes; /* Length of transfer */ int aio_reqprio; /* Request priority offset */ diff --git a/bsd/sys/dtrace_glue.h b/bsd/sys/dtrace_glue.h index 754a272dc..b6f9c2cd2 100644 --- a/bsd/sys/dtrace_glue.h +++ b/bsd/sys/dtrace_glue.h @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include diff --git a/bsd/sys/errno.h b/bsd/sys/errno.h index a11877c52..67aff41a1 100644 --- a/bsd/sys/errno.h +++ b/bsd/sys/errno.h @@ -257,5 +257,6 @@ 
__END_DECLS /* pseudo-errors returned inside kernel to modify return to process */ #define ERESTART (-1) /* restart syscall */ #define EJUSTRETURN (-2) /* don't modify regs, just return */ +#define ERECYCLE (-5) /* restart lookup under heavy vnode pressure/recycling */ #endif #endif /* _SYS_ERRNO_H_ */ diff --git a/bsd/sys/namei.h b/bsd/sys/namei.h index c0a8368ba..50706beec 100644 --- a/bsd/sys/namei.h +++ b/bsd/sys/namei.h @@ -220,7 +220,7 @@ int relookup(struct vnode *dvp, struct vnode **vpp, */ void cache_purgevfs(mount_t mp); int cache_lookup_path(struct nameidata *ndp, struct componentname *cnp, vnode_t dp, - vfs_context_t context, int *trailing_slash, int *dp_authorized); + vfs_context_t context, int *trailing_slash, int *dp_authorized, vnode_t last_dp); void vnode_cache_authorized_action(vnode_t vp, vfs_context_t context, kauth_action_t action); void vnode_uncache_authorized_action(vnode_t vp, kauth_action_t action); diff --git a/bsd/sys/reboot.h b/bsd/sys/reboot.h index 83d48fbc4..67b55c786 100644 --- a/bsd/sys/reboot.h +++ b/bsd/sys/reboot.h @@ -88,6 +88,7 @@ #define RB_UNIPROC 0x80 /* don't start slaves */ #define RB_SAFEBOOT 0x100 /* booting safe */ #define RB_UPSDELAY 0x200 /* Delays restart by 5 minutes */ +#define RB_QUICK 0x400 /* quick and ungraceful reboot with file system caches flushed*/ #define RB_PANIC 0 /* reboot due to panic */ #define RB_BOOT 1 /* reboot due to boot() */ diff --git a/bsd/sys/socket.h b/bsd/sys/socket.h index 6d7d93465..4048673b5 100644 --- a/bsd/sys/socket.h +++ b/bsd/sys/socket.h @@ -193,6 +193,7 @@ struct iovec { #define SO_REUSESHAREUID 0x1025 /* APPLE: Allow reuse of port/socket by different userids */ #ifdef __APPLE_API_PRIVATE #define SO_NOTIFYCONFLICT 0x1026 /* APPLE: send notification if there is a bind on a port which is already in use */ +#define SO_UPCALLCLOSEWAIT 0x1027 /* APPLE: block on close until an upcall returns */ #endif #define SO_LINGER_SEC 0x1080 /* linger on close if data present (in seconds) */ 
#define SO_RESTRICTIONS 0x1081 /* APPLE: deny inbound/outbound/both/flag set */ diff --git a/bsd/sys/socketvar.h b/bsd/sys/socketvar.h index 988ec8d82..9f55d37a6 100644 --- a/bsd/sys/socketvar.h +++ b/bsd/sys/socketvar.h @@ -229,6 +229,7 @@ struct socket { #ifdef __APPLE_API_PRIVATE #define SOF_NOTIFYCONFLICT 0x400 /* notify that a bind was done on a port already in use */ #endif +#define SOF_UPCALLCLOSEWAIT 0x800 /* block on close until an upcall returns */ int so_usecount; /* refcounting of socket use */; int so_retaincnt; u_int32_t so_filteruse; /* usecount for the socket filters */ diff --git a/bsd/vfs/vfs_cache.c b/bsd/vfs/vfs_cache.c index ed6ea3203..c4e93ab8e 100644 --- a/bsd/vfs/vfs_cache.c +++ b/bsd/vfs/vfs_cache.c @@ -827,10 +827,12 @@ boolean_t vnode_cache_is_stale(vnode_t vp) /* * Returns: 0 Success - * ENOENT No such file or directory + * ERECYCLE vnode was recycled from underneath us. Force lookup to be re-driven from namei. + * This errno value should not be seen by anyone outside of the kernel. 
*/ int -cache_lookup_path(struct nameidata *ndp, struct componentname *cnp, vnode_t dp, vfs_context_t ctx, int *trailing_slash, int *dp_authorized) +cache_lookup_path(struct nameidata *ndp, struct componentname *cnp, vnode_t dp, + vfs_context_t ctx, int *trailing_slash, int *dp_authorized, vnode_t last_dp) { char *cp; /* pointer into pathname argument */ int vid; @@ -840,11 +842,9 @@ cache_lookup_path(struct nameidata *ndp, struct componentname *cnp, vnode_t dp, kauth_cred_t ucred; boolean_t ttl_enabled = FALSE; struct timeval tv; - mount_t mp; + mount_t mp; unsigned int hash; -#if CONFIG_MACF - int error; -#endif + int error = 0; ucred = vfs_context_ucred(ctx); *trailing_slash = 0; @@ -945,7 +945,7 @@ skiprsrcfork: error = mac_vnode_check_lookup(ctx, dp, cnp); if (error) { name_cache_unlock(); - return (error); + goto errorout; } } #endif /* MAC */ @@ -1052,35 +1052,41 @@ skiprsrcfork: dp = NULLVP; } else { need_dp: - /* + /* * return the last directory we looked at - * with an io reference held + * with an io reference held. If it was the one passed + * in as a result of the last iteration of VNOP_LOOKUP, + * it should already hold an io ref. No need to increase ref. */ - if (dp == ndp->ni_usedvp) { - /* - * if this vnode matches the one passed in via USEDVP - * than this context already holds an io_count... just - * use vnode_get to get an extra ref for lookup to play - * with... can't use the getwithvid variant here because - * it will block behind a vnode_drain which would result - * in a deadlock (since we already own an io_count that the - * vnode_drain is waiting on)... vnode_get grabs the io_count - * immediately w/o waiting... it always succeeds - */ - vnode_get(dp); - } else if ( (vnode_getwithvid(dp, vid)) ) { - /* - * failure indicates the vnode - * changed identity or is being - * TERMINATED... in either case - * punt this lookup. 
- * - * don't necessarily return ENOENT, though, because - * we really want to go back to disk and make sure it's - * there or not if someone else is changing this - * vnode. - */ - return (ERESTART); + if (last_dp != dp){ + + if (dp == ndp->ni_usedvp) { + /* + * if this vnode matches the one passed in via USEDVP + * than this context already holds an io_count... just + * use vnode_get to get an extra ref for lookup to play + * with... can't use the getwithvid variant here because + * it will block behind a vnode_drain which would result + * in a deadlock (since we already own an io_count that the + * vnode_drain is waiting on)... vnode_get grabs the io_count + * immediately w/o waiting... it always succeeds + */ + vnode_get(dp); + } else if ( (vnode_getwithvid(dp, vid)) ) { + /* + * failure indicates the vnode + * changed identity or is being + * TERMINATED... in either case + * punt this lookup. + * + * don't necessarily return ENOENT, though, because + * we really want to go back to disk and make sure it's + * there or not if someone else is changing this + * vnode. + */ + error = ERECYCLE; + goto errorout; + } } } if (vp != NULLVP) { @@ -1104,7 +1110,22 @@ need_dp: ndp->ni_dvp = dp; ndp->ni_vp = vp; - return (0); +errorout: + /* + * If we came into cache_lookup_path after an iteration of the lookup loop that + * resulted in a call to VNOP_LOOKUP, then VNOP_LOOKUP returned a vnode with a io ref + * on it. It is now the job of cache_lookup_path to drop the ref on this vnode + * when it is no longer needed. If we get to this point, and last_dp is not NULL + * and it is ALSO not the dvp we want to return to caller of this function, it MUST be + * the case that we got to a subsequent path component and this previous vnode is + * no longer needed. We can then drop the io ref on it. + */ + if ((last_dp != NULLVP) && (last_dp != ndp->ni_dvp)){ + vnode_put(last_dp); + } + + //initialized to 0, should be the same if no error cases occurred. 
+ return error; } diff --git a/bsd/vfs/vfs_journal.c b/bsd/vfs/vfs_journal.c index 8f7145d19..df54d26b1 100644 --- a/bsd/vfs/vfs_journal.c +++ b/bsd/vfs/vfs_journal.c @@ -2843,8 +2843,8 @@ end_transaction(transaction *tr, int force_it, errno_t (*callback)(void*), void blhdr->checksum = 0; blhdr->checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE); - if (kmem_alloc(kernel_map, (vm_offset_t *)&bparray, tr->blhdr->num_blocks * sizeof(struct buf *))) { - panic("can't allocate %lu bytes for bparray\n", tr->blhdr->num_blocks * sizeof(struct buf *)); + if (kmem_alloc(kernel_map, (vm_offset_t *)&bparray, blhdr->num_blocks * sizeof(struct buf *))) { + panic("can't allocate %lu bytes for bparray\n", blhdr->num_blocks * sizeof(struct buf *)); } // calculate individual block checksums @@ -2867,7 +2867,7 @@ end_transaction(transaction *tr, int force_it, errno_t (*callback)(void*), void blhdr->binfo[i].b.bp = bparray[i]; } - kmem_free(kernel_map, (vm_offset_t)bparray, tr->blhdr->num_blocks * sizeof(struct buf *)); + kmem_free(kernel_map, (vm_offset_t)bparray, blhdr->num_blocks * sizeof(struct buf *)); if (ret != amt) { printf("jnl: %s: end_transaction: only wrote %d of %d bytes to the journal!\n", diff --git a/bsd/vfs/vfs_lookup.c b/bsd/vfs/vfs_lookup.c index aaabf7bc1..bb8c5dd2b 100644 --- a/bsd/vfs/vfs_lookup.c +++ b/bsd/vfs/vfs_lookup.c @@ -141,7 +141,9 @@ static int vfs_getrealpath(const char * path, char * realpath, size_t bufsize, v * lookup:EROFS * lookup:EACCES * lookup:EPERM - * lookup:??? + * lookup:ERECYCLE vnode was recycled from underneath us in lookup. + * This means we should re-drive lookup from this point. + * lookup: ??? * VNOP_READLINK:??? 
*/ int @@ -150,6 +152,9 @@ namei(struct nameidata *ndp) struct filedesc *fdp; /* pointer to file descriptor state */ char *cp; /* pointer into pathname argument */ struct vnode *dp; /* the directory we are searching */ + struct vnode *usedvp = ndp->ni_dvp; /* store pointer to vp in case we must loop due to + heavy vnode pressure */ + u_long cnpflags = ndp->ni_cnd.cn_flags; /* store in case we have to restore after loop */ uio_t auio; int error; struct componentname *cnp = &ndp->ni_cnd; @@ -170,6 +175,8 @@ namei(struct nameidata *ndp) #endif fdp = p->p_fd; +vnode_recycled: + /* * Get a buffer for the name to be translated, and copy the * name into the buffer. @@ -413,6 +420,14 @@ retry_copy: } cnp->cn_pnbuf = NULL; ndp->ni_vp = NULLVP; + if (error == ERECYCLE){ + /* vnode was recycled underneath us. re-drive lookup to start at + the beginning again, since recycling invalidated last lookup*/ + ndp->ni_cnd.cn_flags = cnpflags; + ndp->ni_dvp = usedvp; + goto vnode_recycled; + } + return (error); } @@ -462,7 +477,7 @@ retry_copy: * ENOTDIR Not a directory * EROFS Read-only file system [CREATE] * EISDIR Is a directory [CREATE] - * cache_lookup_path:ENOENT + * cache_lookup_path:ERECYCLE (vnode was recycled from underneath us, redrive lookup again) * vnode_authorize:EROFS * vnode_authorize:EACCES * vnode_authorize:EPERM @@ -495,6 +510,7 @@ lookup(struct nameidata *ndp) int current_mount_generation = 0; int vbusyflags = 0; int nc_generation = 0; + vnode_t last_dp = NULLVP; /* * Setup: break out flag bits into variables. 
@@ -526,7 +542,7 @@ lookup(struct nameidata *ndp) dirloop: ndp->ni_vp = NULLVP; - if ( (error = cache_lookup_path(ndp, cnp, dp, ctx, &trailing_slash, &dp_authorized)) ) { + if ( (error = cache_lookup_path(ndp, cnp, dp, ctx, &trailing_slash, &dp_authorized, last_dp)) ) { dp = NULLVP; goto bad; } @@ -865,7 +881,12 @@ nextname: if (*cp == '\0') goto emptyname; - vnode_put(dp); + /* + * cache_lookup_path is now responsible for dropping io ref on dp + * when it is called again in the dirloop. This ensures we hold + * a ref on dp until we complete the next round of lookup. + */ + last_dp = dp; goto dirloop; } diff --git a/bsd/vfs/vfs_subr.c b/bsd/vfs/vfs_subr.c index be1ba3291..535603224 100644 --- a/bsd/vfs/vfs_subr.c +++ b/bsd/vfs/vfs_subr.c @@ -2785,19 +2785,23 @@ vfs_init_io_attributes(vnode_t devvp, mount_t mp) } static struct klist fs_klist; +lck_grp_t *fs_klist_lck_grp; +lck_mtx_t *fs_klist_lock; void vfs_event_init(void) { - klist_init(&fs_klist); + fs_klist_lck_grp = lck_grp_alloc_init("fs_klist", NULL); + fs_klist_lock = lck_mtx_alloc_init(fs_klist_lck_grp, NULL); } void vfs_event_signal(__unused fsid_t *fsid, u_int32_t event, __unused intptr_t data) { - + lck_mtx_lock(fs_klist_lock); KNOTE(&fs_klist, event); + lck_mtx_unlock(fs_klist_lock); } /* @@ -3124,16 +3128,19 @@ static int filt_fsattach(struct knote *kn) { + lck_mtx_lock(fs_klist_lock); kn->kn_flags |= EV_CLEAR; KNOTE_ATTACH(&fs_klist, kn); + lck_mtx_unlock(fs_klist_lock); return (0); } static void filt_fsdetach(struct knote *kn) { - + lck_mtx_lock(fs_klist_lock); KNOTE_DETACH(&fs_klist, kn); + lck_mtx_unlock(fs_klist_lock); } static int @@ -3794,11 +3801,18 @@ vnode_reclaim_internal(struct vnode * vp, int locked, int reuse, int flags) vgone(vp, flags); /* clean and reclaim the vnode */ /* - * give the vnode a new identity so - * that vnode_getwithvid will fail - * on any stale cache accesses + * give the vnode a new identity so that vnode_getwithvid will fail + * on any stale cache accesses... 
+ * grab the list_lock so that if we're in "new_vnode" + * behind the list_lock trying to steal this vnode, the v_id is stable... + * once new_vnode drops the list_lock, it will block trying to take + * the vnode lock until we release it... at that point it will evaluate + * whether the v_vid has changed */ + vnode_list_lock(); vp->v_id++; + vnode_list_unlock(); + if (isfifo) { struct fifoinfo * fip; diff --git a/bsd/vfs/vfs_xattr.c b/bsd/vfs/vfs_xattr.c index e9ef92850..30e62d8ce 100644 --- a/bsd/vfs/vfs_xattr.c +++ b/bsd/vfs/vfs_xattr.c @@ -3041,7 +3041,7 @@ lock_xattrfile(vnode_t xvp, short locktype, vfs_context_t context) lf.l_len = 0; lf.l_type = locktype; /* F_WRLCK or F_RDLCK */ /* Note: id is just a kernel address that's not a proc */ - error = VNOP_ADVLOCK(xvp, (caddr_t)xvp, F_SETLK, &lf, F_FLOCK, context); + error = VNOP_ADVLOCK(xvp, (caddr_t)xvp, F_SETLK, &lf, F_FLOCK|F_WAIT, context); return (error == ENOTSUP ? 0 : error); } diff --git a/bsd/vm/vm_unix.c b/bsd/vm/vm_unix.c index 56c2201dc..54e6d30c6 100644 --- a/bsd/vm/vm_unix.c +++ b/bsd/vm/vm_unix.c @@ -93,10 +93,12 @@ * Sysctl's related to data/stack execution. 
See osfmk/vm/vm_map.c */ +#ifndef SECURE_KERNEL extern int allow_stack_exec, allow_data_exec; SYSCTL_INT(_vm, OID_AUTO, allow_stack_exec, CTLFLAG_RW, &allow_stack_exec, 0, ""); SYSCTL_INT(_vm, OID_AUTO, allow_data_exec, CTLFLAG_RW, &allow_data_exec, 0, ""); +#endif /* !SECURE_KERNEL */ #if CONFIG_NO_PRINTF_STRINGS void @@ -819,7 +821,7 @@ shared_region_map_np( memory_object_size_t file_size; user_addr_t user_mappings; struct shared_file_mapping_np *mappings; -#define SFM_MAX_STACK 4 +#define SFM_MAX_STACK 8 struct shared_file_mapping_np stack_mappings[SFM_MAX_STACK]; unsigned int mappings_count; vm_size_t mappings_size; diff --git a/config/BSDKernel.exports b/config/BSDKernel.exports index 678fe7d7a..745ad66f2 100644 --- a/config/BSDKernel.exports +++ b/config/BSDKernel.exports @@ -597,6 +597,7 @@ _ubc_isinuse _ubc_msync _ubc_offtoblk _ubc_page_op +_ubc_pages_resident _ubc_range_op _ubc_setcred _ubc_setsize diff --git a/config/IOKit.exports b/config/IOKit.exports index fd0f8de59..f14615395 100644 --- a/config/IOKit.exports +++ b/config/IOKit.exports @@ -119,7 +119,6 @@ __Z17IODTMapInterruptsP15IORegistryEntry __Z17IODeviceTreeAllocPv __Z17IOServiceOrderingPK15OSMetaClassBaseS1_Pv __Z18IODTCompareNubNamePK15IORegistryEntryP8OSStringPS3_ -__Z19IODTMapOneInterruptP15IORegistryEntryPmPP6OSDataPPK8OSSymbol __Z19printDictionaryKeysP12OSDictionaryPc __Z19tellAppWithResponseP8OSObjectPv __Z20IODTMakeNVDescriptorP15IORegistryEntryP17IONVRAMDescriptor @@ -127,7 +126,6 @@ __Z20IODTMatchNubWithKeysP15IORegistryEntryPKc __Z21IODTResolveAddressingP15IORegistryEntryPKcP14IODeviceMemory __Z22IODTResolveAddressCellP15IORegistryEntryPmS1_S1_ __Z22tellClientWithResponseP8OSObjectPv -__Z23IODTFindInterruptParentP15IORegistryEntry __Z23IODTFindMatchingEntriesP15IORegistryEntrymPKc __Z24broadcast_aggressivenessP8OSObjectPvS1_S1_S1_ __Z26serializedAllowPowerChangeP8OSObjectPvS1_S1_S1_ @@ -561,6 +559,8 @@ __ZN14IOPMrootDomain23requestPowerDomainStateEmP17IOPowerConnectionm 
__ZN14IOPMrootDomain23setQuickSpinDownTimeoutEv __ZN14IOPMrootDomain24displayWranglerPublishedEPvS0_P9IOService __ZN14IOPMrootDomain24receivePowerNotificationEm +__ZN14IOPMrootDomain24systemPowerEventOccurredEPK8OSSymbolP8OSObject +__ZN14IOPMrootDomain24systemPowerEventOccurredEPK8OSSymbolj __ZN14IOPMrootDomain25announcePowerSourceChangeEv __ZN14IOPMrootDomain26handleSleepTimerExpirationEv __ZN14IOPMrootDomain26restoreUserSpinDownTimeoutEv diff --git a/config/MasterVersion b/config/MasterVersion index dfaa71992..6f5ee0f98 100644 --- a/config/MasterVersion +++ b/config/MasterVersion @@ -1,4 +1,4 @@ -9.1.0 +9.2.0 # The first line of this file contains the master version number for the kernel. # All other instances of the kernel version in xnu are derived from this file. diff --git a/config/System6.0.exports b/config/System6.0.exports index 71c1ae0fd..efaa7c605 100644 --- a/config/System6.0.exports +++ b/config/System6.0.exports @@ -186,7 +186,6 @@ __Z17IODTMapInterruptsP15IORegistryEntry __Z17IODeviceTreeAllocPv __Z17IOServiceOrderingPK15OSMetaClassBaseS1_Pv __Z18IODTCompareNubNamePK15IORegistryEntryP8OSStringPS3_ -__Z19IODTMapOneInterruptP15IORegistryEntryPmPP6OSDataPPK8OSSymbol __Z19printDictionaryKeysP12OSDictionaryPc __Z19tellAppWithResponseP8OSObjectPv __Z20IODTMakeNVDescriptorP15IORegistryEntryP17IONVRAMDescriptor @@ -194,7 +193,6 @@ __Z20IODTMatchNubWithKeysP15IORegistryEntryPKc __Z21IODTResolveAddressingP15IORegistryEntryPKcP14IODeviceMemory __Z22IODTResolveAddressCellP15IORegistryEntryPmS1_S1_ __Z22tellClientWithResponseP8OSObjectPv -__Z23IODTFindInterruptParentP15IORegistryEntry __Z23IODTFindMatchingEntriesP15IORegistryEntrymPKc __Z24broadcast_aggressivenessP8OSObjectPvS1_S1_S1_ __Z26serializedAllowPowerChangeP8OSObjectPvS1_S1_S1_ diff --git a/iokit/IOKit/pwr_mgt/IOPM.h b/iokit/IOKit/pwr_mgt/IOPM.h index 362c5ed31..3e447c36c 100644 --- a/iokit/IOKit/pwr_mgt/IOPM.h +++ b/iokit/IOKit/pwr_mgt/IOPM.h @@ -252,13 +252,24 @@ enum { kInflowForciblyEnabledBit = (1 
<< 0) }; +/* kIOPMMessageInternalBatteryFullyDischarged + * The battery has drained completely to its "Fully Discharged" state. + */ #define kIOPMMessageInternalBatteryFullyDischarged \ iokit_family_msg(sub_iokit_powermanagement, 0x120) +/* kIOPMMessageSystemPowerEventOccurred + * Some major system thermal property has changed, and interested clients may + * modify their behavior. + */ +#define kIOPMMessageSystemPowerEventOccurred \ + iokit_family_msg(sub_iokit_powermanagement, 0x130) + /******************************************************************************* * * Power commands issued to root domain + * Use with IOPMrootDomain::receivePowerNotification() * * These commands are issued from system drivers only: * ApplePMU, AppleSMU, IOGraphics, AppleACPIFamily @@ -278,6 +289,7 @@ enum { kIOPMClamshellOpened = (1<<10) // clamshell was opened }; + /******************************************************************************* * * Power Management Return Codes @@ -378,6 +390,76 @@ enum { #define kIOPMPSPostDishargeWaitSecondsKey "PostDischargeWaitSeconds" +/* CPU Power Management status keys + * Pass as arguments to IOPMrootDomain::systemPowerEventOccurred + * Or as arguments to IOPMSystemPowerEventOccurred() + * Or to decode the dictionary obtained from IOPMCopyCPUPowerStatus() + * These keys reflect restrictions placed on the CPU by the system + * to bring the CPU's power consumption within allowable thermal and + * power constraints. + */ + + +/* kIOPMGraphicsPowerLimitsKey + * The key representing the dictionary of graphics power limits. + * The dictionary contains the other kIOPMCPUPower keys & their associated + * values (e.g. Speed limit, Processor Count, and Schedule limits). + */ +#define kIOPMGraphicsPowerLimitsKey "Graphics_Power_Limits" + +/* kIOPMGraphicsPowerLimitPerformanceKey + * The key representing the percent of overall performance made available + * by the graphics chip as a percentage (integer 0 - 100). 
+ */ +#define kIOPMGraphicsPowerLimitPerformanceKey "Graphics_Power_Performance" + + + +/* kIOPMCPUPowerLimitsKey + * The key representing the dictionary of CPU Power Limits. + * The dictionary contains the other kIOPMCPUPower keys & their associated + * values (e.g. Speed limit, Processor Count, and Schedule limits). + */ +#define kIOPMCPUPowerLimitsKey "CPU_Power_Limits" + +/* kIOPMCPUPowerLimitProcessorSpeedKey defines the speed & voltage limits placed + * on the CPU. + * Represented as a percentage (0-100) of maximum CPU speed. + */ +#define kIOPMCPUPowerLimitProcessorSpeedKey "CPU_Speed_Limit" + +/* kIOPMCPUPowerLimitProcessorCountKey reflects how many, if any, CPUs have been + * taken offline. Represented as an integer number of CPUs (0 - Max CPUs). + */ +#define kIOPMCPUPowerLimitProcessorCountKey "CPU_Available_CPUs" + +/* kIOPMCPUPowerLimitSchedulerTimeKey represents the percentage (0-100) of CPU time + * available. 100% at normal operation. The OS may limit this time for a percentage + * less than 100%. + */ +#define kIOPMCPUPowerLimitSchedulerTimeKey "CPU_Scheduler_Limit" + + +/* Thermal Level Warning Key + * Indicates the thermal constraints placed on the system. This value may + * cause clients to take action to consume fewer system resources. + * The value associated with this warning is defined by the platform. + */ +#define kIOPMThermalLevelWarningKey "Thermal_Level_Warning" + +/* Thermal Warning Level values + * kIOPMThermalWarningLevelNormal - under normal operating conditions + * kIOPMThermalWarningLevelDanger - thermal pressure may cause system slowdown + * kIOPMThermalWarningLevelCrisis - thermal conditions may cause imminent shutdown + * + * The platform may define additional thermal levels if necessary. 
+ */ +enum { + kIOPMThermalWarningLevelNormal = 0, + kIOPMThermalWarningLevelDanger = 5, + kIOPMThermalWarningLevelCrisis = 10 +}; + // PM Settings Controller setting types // Settings types used primarily with: diff --git a/iokit/IOKit/pwr_mgt/RootDomain.h b/iokit/IOKit/pwr_mgt/RootDomain.h index 2605169c3..c528b8c3e 100644 --- a/iokit/IOKit/pwr_mgt/RootDomain.h +++ b/iokit/IOKit/pwr_mgt/RootDomain.h @@ -41,12 +41,15 @@ enum { kPCICantSleep = 0x00000004 }; + + /* *IOPMrootDomain registry property keys */ #define kRootDomainSupportedFeatures "Supported Features" #define kRootDomainSleepReasonKey "Last Sleep Reason" #define kRootDomainSleepOptionsKey "Last Sleep Options" +#define kIOPMRootDomainPowerStatusKey "Power Status" /* * Possible sleep reasons found under kRootDomainSleepReasonsKey @@ -115,6 +118,22 @@ public: virtual IOReturn setProperties ( OSObject * ); IOReturn shutdownSystem ( void ); IOReturn restartSystem ( void ); + +/*! @function systemPowerEventOccurred + @abstract Other drivers may inform IOPMrootDomain of system PM events + @discussion systemPowerEventOccurred is a richer alternative to receivePowerNotification() + Only Apple-owned kexts should have reason to call systemPowerEventOccurred. + @param event An OSSymbol describing the type of power event. + @param value A 32-bit integer value associated with the event. + @param shouldUpdate indicates whether the root domain should send a notification + to interested parties. Pass false if you're calling systemPowerEventOccurred + several times in succession; and pass true only on the last invocation. 
+ @result kIOReturnSuccess on success */ + IOReturn systemPowerEventOccurred(const OSSymbol *event, + uint32_t intValue); + IOReturn systemPowerEventOccurred(const OSSymbol *event, + OSObject *value); + virtual IOReturn receivePowerNotification (UInt32 msg); virtual void setSleepSupported( IOOptionBits flags ); virtual IOOptionBits getSleepSupported(); diff --git a/iokit/Kernel/IOBufferMemoryDescriptor.cpp b/iokit/Kernel/IOBufferMemoryDescriptor.cpp index efe64454d..5fbfc6715 100644 --- a/iokit/Kernel/IOBufferMemoryDescriptor.cpp +++ b/iokit/Kernel/IOBufferMemoryDescriptor.cpp @@ -500,7 +500,7 @@ void IOBufferMemoryDescriptor::free() IOOptionBits options = _options; vm_size_t size = _capacity; void * buffer = _buffer; - IOVirtualAddress source = _ranges.v64->address; + mach_vm_address_t source = (_ranges.v) ? _ranges.v64->address : 0; IOMemoryMap * map = 0; vm_offset_t alignment = _alignment; @@ -524,7 +524,7 @@ void IOBufferMemoryDescriptor::free() else if (buffer) { if (kIOMemoryTypePhysical64 == (flags & kIOMemoryTypeMask)) - IOFreePhysical((mach_vm_address_t) source, size); + IOFreePhysical(source, size); else if (options & kIOMemoryPhysicallyContiguous) IOKernelFreeContiguous((mach_vm_address_t) buffer, size); else if (alignment > 1) diff --git a/iokit/Kernel/IODMACommand.cpp b/iokit/Kernel/IODMACommand.cpp index 9aece35ff..75d751afe 100644 --- a/iokit/Kernel/IODMACommand.cpp +++ b/iokit/Kernel/IODMACommand.cpp @@ -263,6 +263,7 @@ IODMACommand::setMemoryDescriptor(const IOMemoryDescriptor *mem, bool autoPrepar else fInternalState->fCheckAddressing = (fNumAddressBits && (highPage >= (1UL << (fNumAddressBits - PAGE_SHIFT)))); + fInternalState->fNewMD = true; mem->retain(); fMemory = mem; @@ -857,10 +858,11 @@ IODMACommand::genIOVMSegments(InternalSegmentFunction outSegFunc, if (offset >= memLength) return kIOReturnOverrun; - if ((offset == internalState->fPreparedOffset) || (offset != state->fOffset)) { + if ((offset == internalState->fPreparedOffset) || (offset 
!= state->fOffset) || internalState->fNewMD) { state->fOffset = 0; state->fIOVMAddr = 0; internalState->fNextRemapIndex = 0; + internalState->fNewMD = false; state->fMapped = (IS_MAPPED(fMappingOptions) && fMapper); mdOp = kIOMDFirstSegment; }; diff --git a/iokit/Kernel/IODeviceTreeSupport.cpp b/iokit/Kernel/IODeviceTreeSupport.cpp index 4b6a1fdf2..381022c56 100644 --- a/iokit/Kernel/IODeviceTreeSupport.cpp +++ b/iokit/Kernel/IODeviceTreeSupport.cpp @@ -435,15 +435,21 @@ static bool GetUInt32( IORegistryEntry * regEntry, const OSSymbol * name, return( false ); } -IORegistryEntry * IODTFindInterruptParent( IORegistryEntry * regEntry ) +static IORegistryEntry * IODTFindInterruptParent( IORegistryEntry * regEntry, IOItemCount index ) { IORegistryEntry * parent; UInt32 phandle; + OSData * data; + unsigned int len; - if( GetUInt32( regEntry, gIODTInterruptParentKey, &phandle)) - parent = FindPHandle( phandle ); + if( (data = OSDynamicCast( OSData, regEntry->getProperty( gIODTInterruptParentKey ))) + && (sizeof(UInt32) <= (len = data->getLength()))) { + if (((index + 1) * sizeof(UInt32)) > len) + index = 0; + phandle = ((UInt32 *) data->getBytesNoCopy())[index]; + parent = FindPHandle( phandle ); - else if( 0 == regEntry->getProperty( "interrupt-controller")) + } else if( 0 == regEntry->getProperty( "interrupt-controller")) parent = regEntry->getParentEntry( gIODTPlane); else parent = 0; @@ -481,8 +487,8 @@ static void IODTGetICellCounts( IORegistryEntry * regEntry, *aCellCount = 0; } -UInt32 IODTMapOneInterrupt( IORegistryEntry * regEntry, UInt32 * intSpec, - OSData ** spec, const OSSymbol ** controller ) +static UInt32 IODTMapOneInterrupt( IORegistryEntry * regEntry, UInt32 * intSpec, UInt32 index, + OSData ** spec, const OSSymbol ** controller ) { IORegistryEntry *parent = 0; OSData *data; @@ -494,7 +500,7 @@ UInt32 IODTMapOneInterrupt( IORegistryEntry * regEntry, UInt32 * intSpec, UInt32 i, original_icells; bool cmp, ok = false; - parent = IODTFindInterruptParent( 
regEntry ); + parent = IODTFindInterruptParent( regEntry, index ); IODTGetICellCounts( parent, &icells, &acells ); addrCmp = 0; if( acells) { @@ -640,11 +646,12 @@ static bool IODTMapInterruptsSharing( IORegistryEntry * regEntry, OSDictionary * OSData * local2; UInt32 * localBits; UInt32 * localEnd; + IOItemCount index; OSData * map; OSObject * oneMap; OSArray * mapped; OSArray * controllerInts; - const OSSymbol * controller; + const OSSymbol * controller = 0; OSArray * controllers; UInt32 skip = 1; bool ok, nw; @@ -666,6 +673,7 @@ static bool IODTMapInterruptsSharing( IORegistryEntry * regEntry, OSDictionary * localBits = (UInt32 *) local->getBytesNoCopy(); localEnd = localBits + (local->getLength() / sizeof(UInt32)); + index = 0; mapped = OSArray::withCapacity( 1 ); controllers = OSArray::withCapacity( 1 ); @@ -673,7 +681,7 @@ static bool IODTMapInterruptsSharing( IORegistryEntry * regEntry, OSDictionary * if( ok) do { if( nw) { - skip = IODTMapOneInterrupt( regEntry, localBits, &map, &controller ); + skip = IODTMapOneInterrupt( regEntry, localBits, index, &map, &controller ); if( 0 == skip) { IOLog("%s: error mapping interrupt[%d]\n", regEntry->getName(), mapped->getCount()); @@ -686,6 +694,7 @@ static bool IODTMapInterruptsSharing( IORegistryEntry * regEntry, OSDictionary * controller->retain(); } + index++; localBits += skip; mapped->setObject( map ); controllers->setObject( controller ); diff --git a/iokit/Kernel/IOHibernateIO.cpp b/iokit/Kernel/IOHibernateIO.cpp index 030368a72..ae66cb9b8 100644 --- a/iokit/Kernel/IOHibernateIO.cpp +++ b/iokit/Kernel/IOHibernateIO.cpp @@ -1598,7 +1598,10 @@ IOHibernateSystemWake(void) const OSSymbol * sym = OSSymbol::withCStringNoCopy(kIOHibernateRTCVariablesKey); if (sym) { - gIOOptionsEntry->removeProperty(sym); + if (gIOOptionsEntry->getProperty(sym)) { + gIOOptionsEntry->removeProperty(sym); + gIOOptionsEntry->sync(); + } sym->release(); } } diff --git a/iokit/Kernel/IOKitKernelInternal.h 
b/iokit/Kernel/IOKitKernelInternal.h index afa64a600..a21ff0031 100644 --- a/iokit/Kernel/IOKitKernelInternal.h +++ b/iokit/Kernel/IOKitKernelInternal.h @@ -113,7 +113,7 @@ struct IODMACommandInternal UInt8 fCopyContig; UInt8 fPrepared; UInt8 fDoubleBuffer; - UInt8 __pad[1]; + UInt8 fNewMD; ppnum_t fCopyPageAlloc; ppnum_t fCopyPageCount; diff --git a/iokit/Kernel/IOMemoryDescriptor.cpp b/iokit/Kernel/IOMemoryDescriptor.cpp index 3c0b8f7e1..43321aac1 100644 --- a/iokit/Kernel/IOMemoryDescriptor.cpp +++ b/iokit/Kernel/IOMemoryDescriptor.cpp @@ -1017,6 +1017,8 @@ void IOGeneralMemoryDescriptor::free() IODelete(_ranges.v64, IOAddressRange, _rangesCount); else IODelete(_ranges.v, IOVirtualRange, _rangesCount); + + _ranges.v = NULL; } if (reserved && reserved->devicePager) diff --git a/iokit/Kernel/IOPMrootDomain.cpp b/iokit/Kernel/IOPMrootDomain.cpp index 9af5919f4..81568ee1e 100644 --- a/iokit/Kernel/IOPMrootDomain.cpp +++ b/iokit/Kernel/IOPMrootDomain.cpp @@ -1076,6 +1076,9 @@ void IOPMrootDomain::powerChangeDone ( unsigned long previousState ) // re-enable this timer for next sleep idleSleepPending = false; gSleepOrShutdownPending = 0; + + // Invalidate prior activity tickles to allow wake from doze. + if (wrangler) wrangler->changePowerStateTo(0); break; case RESTART_STATE: @@ -1653,12 +1656,87 @@ void IOPMrootDomain::informCPUStateChange( #endif __i386__ } +//****************************************************************************** +// systemPowerEventOccurred +// +// The power controller is notifying us of a hardware-related power management +// event that we must handle. +// +// systemPowerEventOccurred covers the same functionality that receivePowerNotification +// does; it simply provides a richer API for conveying more information. 
+//****************************************************************************** +IOReturn IOPMrootDomain::systemPowerEventOccurred( + const OSSymbol *event, + uint32_t intValue) +{ + IOReturn attempt = kIOReturnSuccess; + OSNumber *newNumber = NULL; + + if (!event) + return kIOReturnBadArgument; + + newNumber = OSNumber::withNumber(intValue, 8*sizeof(intValue)); + if (!newNumber) + return kIOReturnInternalError; + + attempt = systemPowerEventOccurred(event, (OSObject *)newNumber); + + newNumber->release(); + + return attempt; +} + +IOReturn IOPMrootDomain::systemPowerEventOccurred( + const OSSymbol *event, + OSObject *value) +{ + OSDictionary *thermalsDict = NULL; + bool shouldUpdate = true; + + if (!event || !value) + return kIOReturnBadArgument; + + // LOCK + // We reuse featuresDict Lock because it already exists and guards + // the very infrequently used publish/remove feature mechanism; so there's zero risk + // of stepping on that lock. + if (featuresDictLock) IOLockLock(featuresDictLock); + + thermalsDict = (OSDictionary *)getProperty(kIOPMRootDomainPowerStatusKey); + + if (thermalsDict && OSDynamicCast(OSDictionary, thermalsDict)) { + thermalsDict = OSDictionary::withDictionary(thermalsDict); + } else { + thermalsDict = OSDictionary::withCapacity(1); + } + + if (!thermalsDict) { + shouldUpdate = false; + goto exit; + } + + thermalsDict->setObject (event, value); + + setProperty (kIOPMRootDomainPowerStatusKey, thermalsDict); + + thermalsDict->release(); + +exit: + // UNLOCK + if (featuresDictLock) IOLockUnlock(featuresDictLock); + + if (shouldUpdate) + messageClients (kIOPMMessageSystemPowerEventOccurred, (void *)NULL); + + return kIOReturnSuccess; +} + //****************************************************************************** // receivePowerNotification // // The power controller is notifying us of a hardware-related power management -// event that we must handle. This is a result of an 'environment' interrupt from +// event that we must handle. 
This may be a result of an 'environment' interrupt from // the power mgt micro. //****************************************************************************** diff --git a/iokit/Kernel/IOPlatformExpert.cpp b/iokit/Kernel/IOPlatformExpert.cpp index 03d349e4f..1b53461ec 100644 --- a/iokit/Kernel/IOPlatformExpert.cpp +++ b/iokit/Kernel/IOPlatformExpert.cpp @@ -46,6 +46,7 @@ #include #include +#include extern "C" { #include @@ -858,29 +859,57 @@ void PESetGMTTimeOfDay(long secs) void IOPlatformExpert::registerNVRAMController(IONVRAMController * caller) { OSData * data; - IORegistryEntry * nvram; - OSString * string; + IORegistryEntry * entry; + OSString * string = 0; + char uuid[ 36 + 1 ]; - nvram = IORegistryEntry::fromPath( "/options", gIODTPlane ); - if ( nvram ) + entry = IORegistryEntry::fromPath( "/efi/platform", gIODTPlane ); + if ( entry ) { - data = OSDynamicCast( OSData, nvram->getProperty( "platform-uuid" ) ); - if ( data && data->getLength( ) == sizeof( uuid_t ) ) + data = OSDynamicCast( OSData, entry->getProperty( "system-id" ) ); + if ( data && data->getLength( ) == 16 ) { - char uuid[ 36 + 1 ]; - uuid_unparse( ( UInt8 * ) data->getBytesNoCopy( ), uuid ); + SHA1_CTX context; + uint8_t digest[ SHA_DIGEST_LENGTH ]; + const uuid_t space = { 0x2A, 0x06, 0x19, 0x90, 0xD3, 0x8D, 0x44, 0x40, 0xA1, 0x39, 0xC4, 0x97, 0x70, 0x37, 0x65, 0xAC }; + SHA1Init( &context ); + SHA1Update( &context, space, sizeof( space ) ); + SHA1Update( &context, data->getBytesNoCopy( ), data->getLength( ) ); + SHA1Final( digest, &context ); + + digest[ 6 ] = ( digest[ 6 ] & 0x0F ) | 0x50; + digest[ 8 ] = ( digest[ 8 ] & 0x3F ) | 0x80; + + uuid_unparse( digest, uuid ); string = OSString::withCString( uuid ); - if ( string ) - { - getProvider( )->setProperty( kIOPlatformUUIDKey, string ); - publishResource( kIOPlatformUUIDKey, string ); + } - string->release( ); + entry->release( ); + } + + if ( string == 0 ) + { + entry = IORegistryEntry::fromPath( "/options", gIODTPlane ); + if ( entry 
) + { + data = OSDynamicCast( OSData, entry->getProperty( "platform-uuid" ) ); + if ( data && data->getLength( ) == sizeof( uuid_t ) ) + { + uuid_unparse( ( uint8_t * ) data->getBytesNoCopy( ), uuid ); + string = OSString::withCString( uuid ); } + + entry->release( ); } + } + + if ( string ) + { + getProvider( )->setProperty( kIOPlatformUUIDKey, string ); + publishResource( kIOPlatformUUIDKey, string ); - nvram->release( ); + string->release( ); } publishResource("IONVRAM"); @@ -1281,7 +1310,7 @@ IOReturn IOPlatformExpertDevice::setProperties( OSObject * properties ) object = dictionary->getObject( kIOPlatformUUIDKey ); if ( object ) { - IORegistryEntry * nvram; + IORegistryEntry * entry; OSString * string; uuid_t uuid; @@ -1294,11 +1323,11 @@ IOReturn IOPlatformExpertDevice::setProperties( OSObject * properties ) status = uuid_parse( string->getCStringNoCopy( ), uuid ); if ( status != 0 ) return kIOReturnBadArgument; - nvram = IORegistryEntry::fromPath( "/options", gIODTPlane ); - if ( nvram ) + entry = IORegistryEntry::fromPath( "/options", gIODTPlane ); + if ( entry ) { - nvram->setProperty( "platform-uuid", uuid, sizeof( uuid_t ) ); - nvram->release( ); + entry->setProperty( "platform-uuid", uuid, sizeof( uuid_t ) ); + entry->release( ); } setProperty( kIOPlatformUUIDKey, string ); diff --git a/iokit/bsddev/IOKitBSDInit.cpp b/iokit/bsddev/IOKitBSDInit.cpp index 7621a257f..895f27b98 100644 --- a/iokit/bsddev/IOKitBSDInit.cpp +++ b/iokit/bsddev/IOKitBSDInit.cpp @@ -43,6 +43,7 @@ extern "C" { extern dev_t mdevadd(int devid, ppnum_t base, unsigned int size, int phys); extern dev_t mdevlookup(int devid); +extern void mdevremoveall(void); kern_return_t IOKitBSDInit( void ) @@ -776,14 +777,19 @@ iofrootx: void IOSecureBSDRoot(const char * rootName) { #if CONFIG_EMBEDDED + IOReturn result; IOPlatformExpert *pe; - const OSSymbol *functionName = OSSymbol::withCStringNoCopy("SecureRootName"); + const OSSymbol *functionName = OSSymbol::withCStringNoCopy("SecureRootName"); 
while ((pe = IOService::getPlatform()) == 0) IOSleep(1 * 1000); - pe->callPlatformFunction(functionName, false, (void *)rootName, (void *)0, (void *)0, (void *)0); + // Returns kIOReturnNotPrivileged if the root device is not secure. + // Returns kIOReturnUnsupported if "SecureRootName" is not implemented. + result = pe->callPlatformFunction(functionName, false, (void *)rootName, (void *)0, (void *)0, (void *)0); functionName->release(); + + if (result == kIOReturnNotPrivileged) mdevremoveall(); #endif } diff --git a/iokit/conf/MASTER b/iokit/conf/MASTER index bf820b20e..ae3f0e88b 100644 --- a/iokit/conf/MASTER +++ b/iokit/conf/MASTER @@ -63,7 +63,6 @@ options IOKITCPP # C++ implementation # options KDEBUG # kernel tracing # options NETWORKING # kernel networking # options CRYPTO # want crypto code # -options KPIDIRECT # direct access # options CONFIG_DTRACE # enable dtrace # #makeoptions LIBDRIVER = "libDriver_kern.o" # diff --git a/iokit/conf/files b/iokit/conf/files index dea0585dc..3b87d080a 100644 --- a/iokit/conf/files +++ b/iokit/conf/files @@ -3,7 +3,6 @@ OPTIONS/iokitcpp optional iokitcpp OPTIONS/kdebug optional kdebug OPTIONS/networking optional networking -OPTIONS/kpidirect optional kpidirect OPTIONS/hibernation optional hibernation OPTIONS/crypto optional crypto OPTIONS/config_dtrace optional config_dtrace diff --git a/kgmacros b/kgmacros index f0f7e3df4..8da2cc9b2 100644 --- a/kgmacros +++ b/kgmacros @@ -5383,3 +5383,169 @@ document showMCAstate Syntax: showMCAstate | Print machine-check register state after MC exception.
end + +define _pt_step + # + # Step to lower-level page table and print attributes + # $kgm_pt_paddr: current page table entry physical address + # $kgm_pt_index: current page table entry index (0..511) + # returns + # $kgm_pt_paddr: next level page table entry physical address + # or null if invalid + # For $kgm_pt_verbose = 0: print nothing + # 1: print basic information + # 2: print basic information and hex table dump + # The trickery with kdp_src_high32 is required for accesses above 4GB. + # + set $kgm_entryp = $kgm_pt_paddr + 8*$kgm_pt_index + set kdp_src_high32 = $kgm_pt_paddr >> 32 + set kdp_trans_off = 1 + set $entry = *(pt_entry_t *)($kgm_entryp & 0x0ffffffffULL) + if $kgm_pt_verbose == 2 + x/512g ($kgm_pt_paddr & 0x0ffffffffULL) + end + set kdp_trans_off = 0 + set kdp_src_high32 = 0 + set $kgm_paddr_mask = ~((0xffffULL<<48) | 0xfffULL) + if $kgm_pt_verbose == 0 + if $entry & (0x1 << 0) + set $kgm_pt_paddr = $entry & $kgm_paddr_mask + else + set $kgm_pt_paddr = 0 + end + else + printf "0x%016llx:\n\t0x%016llx\n\t", $kgm_entryp, $entry + if $entry & (0x1 << 0) + printf "valid" + set $kgm_pt_paddr = $entry & $kgm_paddr_mask + else + printf "invalid" + set $kgm_pt_paddr = 0 + end + if $entry & (0x1 << 1) + printf " writeable" + else + printf " read-only" + end + if $entry & (0x1 << 2) + printf " user" + else + printf " supervisor" + end + if $entry & (0x1 << 3) + printf " PWT" + end + if $entry & (0x1 << 4) + printf " PCD" + end + if $entry & (0x1 << 5) + printf " accessed" + end + if $entry & (0x1 << 6) + printf " dirty" + end + if $entry & (0x1 << 7) + printf " PAT" + end + if $entry & (0x1 << 8) + printf " global" + end + if $entry & (0x3 << 9) + printf " avail:0x%x", ($entry >> 9) & 0x3 + end + if $entry & (0x1 << 63) + printf " noexec" + end + printf "\n" + end +end + +define _pmap_walk + set $kgm_pmap = (pmap_t) $arg0 + set $kgm_vaddr = $arg1 + set $kgm_pt_paddr = $kgm_pmap->pm_cr3 + if $kgm_pt_paddr && cpu_64bit + set $kgm_pt_index = ($kgm_vaddr >> 
39) & 0x1ffULL + if $kgm_pt_verbose + printf "pml4 (index %d):\n", $kgm_pt_index + end + _pt_step + end + if $kgm_pt_paddr + set $kgm_pt_index = ($kgm_vaddr >> 30) & 0x1ffULL + if $kgm_pt_verbose + printf "pdpt (index %d):\n", $kgm_pt_index + end + _pt_step + end + if $kgm_pt_paddr + set $kgm_pt_index = ($kgm_vaddr >> 21) & 0x1ffULL + if $kgm_pt_verbose + printf "pdt (index %d):\n", $kgm_pt_index + end + _pt_step + end + if $kgm_pt_paddr + set $kgm_pt_index = ($kgm_vaddr >> 12) & 0x1ffULL + if $kgm_pt_verbose + printf "pt (index %d):\n", $kgm_pt_index + end + _pt_step + end + if $kgm_pt_paddr + set $kgm_paddr = $kgm_pt_paddr + ($kgm_vaddr & 0xfffULL) + set kdp_trans_off = 1 + set kdp_src_high32 = $kgm_paddr >> 32 + set $kgm_value = *($kgm_paddr & 0x0ffffffffULL) + set kdp_trans_off = 0 + set kdp_src_high32 = 0 + printf "phys 0x%016llx: 0x%08x\n", $kgm_paddr, $kgm_value + else + set $kgm_paddr = 0 + printf "(no translation)\n" + end +end + +define pmap_walk + if $kgm_mtype != 7 + printf "Not available for current architecture.\n" + else + if $argc != 2 + printf "pmap_walk \n" + else + if !$kgm_pt_verbose + set $kgm_pt_verbose = 1 + else + if $kgm_pt_verbose != 2 + set $kgm_pt_verbose = 1 + end + end + _pmap_walk $arg0 $arg1 + end + end +end + +document pmap_walk +Syntax: (gdb) pmap_walk <pmap> <virtual_address> +| Perform a page-table walk in <pmap> for <virtual_address>. +| Set $kgm_pt_verbose=2 for full hex dump of page tables. +end + +define pmap_vtop + if $kgm_mtype != 7 + printf "Not available for current architecture.\n" + else + if $argc != 2 + printf "pmap_vtop \n" + else + set $kgm_pt_verbose = 0 + _pmap_walk $arg0 $arg1 + end + end +end + +document pmap_vtop +Syntax: (gdb) pmap_vtop <pmap> <virtual_address> +| For page-tables in <pmap> translate <virtual_address> to physical address.
+end + diff --git a/libsyscall/Makefile b/libsyscall/Makefile index 9b193ef21..ab642795b 100644 --- a/libsyscall/Makefile +++ b/libsyscall/Makefile @@ -43,7 +43,7 @@ MAKEOBJDIR ?= ${OBJROOT} # add version string SRCS += libsyscall_version.c libsyscall_version.c: - /Developer/Makefiles/bin/version.pl Libsyscall > $@ + ${NEXT_ROOT}/Developer/Makefiles/bin/version.pl Libsyscall > $@ CFLAGS += -I${SYMROOT} .include "${.CURDIR}/Makefile.inc" diff --git a/libsyscall/Makefile.xbs b/libsyscall/Makefile.xbs index 62721adbd..4b3d8d543 100644 --- a/libsyscall/Makefile.xbs +++ b/libsyscall/Makefile.xbs @@ -94,9 +94,15 @@ PRIVHDRSPPC = ${PRIVHDRS}/architecture/ppc KERNELFRAMEWORK = ${DESTDIR}/System/Library/Frameworks/Kernel.framework PRIVKERNELHDRS = ${KERNELFRAMEWORK}/Versions/A/PrivateHeaders +.if ${MACHINE_ARCH} == armv6 +ARCHDIR = arm +.else +ARCHDIR = ${MACHINE_ARCH} +.endif + installhdrs-md: gen_md_mig_defs - mkdir -p ${INCDIR}/mach/${MACHINE_ARCH} - ${INSTALL} -o 0 -c -m 444 ${MD_MIGHDRS} ${INCDIR}/mach/${MACHINE_ARCH} + mkdir -p ${INCDIR}/mach/${ARCHDIR} + ${INSTALL} -o 0 -c -m 444 ${MD_MIGHDRS} ${INCDIR}/mach/${ARCHDIR} mkdir -p ${PRIVHDRSPPC} ${INSTALL} -c -m 444 ${PRIVHDRSPPCHDRS} ${PRIVHDRSPPC} diff --git a/libsyscall/create-syscalls.pl b/libsyscall/create-syscalls.pl index f54f344ed..83bf17c1f 100755 --- a/libsyscall/create-syscalls.pl +++ b/libsyscall/create-syscalls.pl @@ -102,7 +102,7 @@ my %TypeBytes = ( ########################################################################## # Make a __xxx.s file: if it exists in the $CustomDir, just copy it, otherwise -# create one. We define the macro __SYSCALL_I386_ARG_BYTES so that SYS.h could +# create one. We define the macro __SYSCALL_32BIT_ARG_BYTES so that SYS.h could # use that to define __SYSCALL dependent on the arguments' total size. 
########################################################################## sub make_s { @@ -119,7 +119,7 @@ sub make_s { } else { my $f = IO::File->new($path, 'w'); die "$MyName: $path: $!\n" unless defined($f); - print $f "#define __SYSCALL_I386_ARG_BYTES $bytes\n\n"; + print $f "#define __SYSCALL_32BIT_ARG_BYTES $bytes\n\n"; print $f "#include \"SYS.h\"\n\n"; print $f "__SYSCALL($pseudo, $name, $args)\n"; print "Creating $path\n"; diff --git a/libsyscall/custom/SYS.h b/libsyscall/custom/SYS.h index 53039d9e3..af9074020 100644 --- a/libsyscall/custom/SYS.h +++ b/libsyscall/custom/SYS.h @@ -138,14 +138,14 @@ LEAF(_##name, 0) ;\ BRANCH_EXTERN(cerror) ;\ 2: -#if defined(__SYSCALL_I386_ARG_BYTES) && ((__SYSCALL_I386_ARG_BYTES >= 4) && (__SYSCALL_I386_ARG_BYTES <= 20)) +#if defined(__SYSCALL_32BIT_ARG_BYTES) && ((__SYSCALL_32BIT_ARG_BYTES >= 4) && (__SYSCALL_32BIT_ARG_BYTES <= 20)) #define UNIX_SYSCALL_NONAME(name, nargs) \ - movl $(SYS_##name | (__SYSCALL_I386_ARG_BYTES << I386_SYSCALL_ARG_BYTES_SHIFT)), %eax ;\ + movl $(SYS_##name | (__SYSCALL_32BIT_ARG_BYTES << I386_SYSCALL_ARG_BYTES_SHIFT)), %eax ;\ UNIX_SYSCALL_SYSENTER ;\ jnb 2f ;\ BRANCH_EXTERN(cerror) ;\ 2: -#else /* __SYSCALL_I386_ARG_BYTES < 4 || > 20 */ +#else /* __SYSCALL_32BIT_ARG_BYTES < 4 || > 20 */ #define UNIX_SYSCALL_NONAME(name, nargs) \ .globl cerror ;\ movl $ SYS_##name, %eax ;\ diff --git a/libsyscall/mach/Makefile.inc b/libsyscall/mach/Makefile.inc index 4efe509ce..40048e71e 100644 --- a/libsyscall/mach/Makefile.inc +++ b/libsyscall/mach/Makefile.inc @@ -1,6 +1,11 @@ # machine-dependent mach sources -.if exists(${.CURDIR}/mach/${MACHINE_ARCH}/Makefile.inc) -.include "${.CURDIR}/mach/${MACHINE_ARCH}/Makefile.inc" +.if ${MACHINE_ARCH} == armv6 +ARCHDIR = arm +.else +ARCHDIR = ${MACHINE_ARCH} +.endif +.if exists(${.CURDIR}/mach/${ARCHDIR}/Makefile.inc) +.include "${.CURDIR}/mach/${ARCHDIR}/Makefile.inc" .endif .PATH: ${.CURDIR}/mach diff --git a/makedefs/MakeInc.def b/makedefs/MakeInc.def index 
b08213bc0..0d4989a32 100644 --- a/makedefs/MakeInc.def +++ b/makedefs/MakeInc.def @@ -84,47 +84,10 @@ $(error There were $(words $(KERNEL_CONFIG)) parameters passed to KERNEL_CONFIG Are you sure? To specify multiple configurations please use KERNEL_CONFIGS) endif -# -# Machine Configuration options -# -# ppc supported configurations : none -# i386 supported configurations : none -# arm supported configurations : LN2410SBC MX31ADS INTEGRATORCP S5I3000SMDK S5L8900XFPGA S5L8900XRB OLOCREEK -# -ifndef SUPPORTED_MACHINE_CONFIGS -export SUPPORTED_MACHINE_CONFIGS = LN2410SBC MX31ADS INTEGRATORCP S5I3000SMDK S5L8900XFPGA S5L8900XRB OLOCREEK DEFAULT -endif - -export DEFAULT_ARM_MACHINE_CONFIG = S5L8900XRB - ifndef MACHINE_CONFIG export MACHINE_CONFIG = DEFAULT endif -ifndef MACHINE_FLAGS_LN2410SBC -export MACHINE_FLAGS_LN2410SBC = -DARM_BOARD_CONFIG_LN2410_920T -endif -ifndef MACHINE_FLAGS_MX31ADS -export MACHINE_FLAGS_MX31ADS = -DARM_BOARD_CONFIG_MX31ADS_1136JFS -endif -ifndef MACHINE_FLAGS_INTEGRATORCP -export MACHINE_FLAGS_INTEGRATORCP = -DARM_BOARD_CONFIG_INTEGRATORCP_1136JFS -endif -ifndef MACHINE_FLAGS_S5I3000SMDK -export MACHINE_FLAGS_S5I3000SMDK = -DARM_BOARD_CONFIG_S5I3000SMDK_1176JZFS -endif -ifndef MACHINE_FLAGS_S5L8900XFPGA -export MACHINE_FLAGS_S5L8900XFPGA = -DARM_BOARD_CONFIG_S5L8900XFPGA_1136JFS -endif -ifndef MACHINE_FLAGS_S5L8900XRB -export MACHINE_FLAGS_S5L8900XRB = -DARM_BOARD_CONFIG_S5L8900XRB -endif -ifndef MACHINE_FLAGS_OLOCREEK -export MACHINE_FLAGS_OLOCREEK = -DARM_BOARD_CONFIG_OLOCREEK -endif -ifndef MACHINE_FLAGS_DEFAULT -export MACHINE_FLAGS_DEFAULT = -endif # # Target configuration options. 
NOTE - target configurations will @@ -234,13 +197,6 @@ ARCH_FLAGS_PPC = -arch ppc ARCH_FLAGS_I386 = -arch i386 ARCH_FLAGS_ARM = $($(addsuffix $(MACHINE_CONFIG),ARCH_FLAGS_ARM_)) -ARCH_FLAGS_ARM_LN2410SBC = -arch arm -ARCH_FLAGS_ARM_MX31ADS = -arch armv6 -ARCH_FLAGS_ARM_INTEGRATORCP = -arch armv6 -ARCH_FLAGS_ARM_S5I3000SMDK = -arch armv6 -ARCH_FLAGS_ARM_S5L8900XFPGA = -arch armv6 -ARCH_FLAGS_ARM_S5L8900XRB = -arch armv6 -ARCH_FLAGS_ARM_OLOCREEK = -arch arm # # Default CFLAGS @@ -290,6 +246,12 @@ endif ifeq (-arch armv6,$(ARCH_FLAGS_ARM)) CFLAGS_ARM += -mthumb endif +ifeq (-arch armv5,$(ARCH_FLAGS_ARM)) +CFLAGS_ARM += -mthumb +endif +ifeq (-arch xscale,$(ARCH_FLAGS_ARM)) +CFLAGS_ARM += -mthumb +endif export CFLAGS_RELEASEPPC = -O2 -mcpu=750 -mmultiple export CFLAGS_RELEASE_TRACEPPC = -O2 -mcpu=750 -mmultiple diff --git a/osfmk/conf/MASTER.i386 b/osfmk/conf/MASTER.i386 index 01731eae5..5eb745c87 100644 --- a/osfmk/conf/MASTER.i386 +++ b/osfmk/conf/MASTER.i386 @@ -58,5 +58,9 @@ options PAE options X86_64 options DISPATCH_COUNTS +# +# Note: MAC options must be set in all the bsd/conf, osfmk/conf, and +# security/conf MASTER files. +# options CONFIG_MACF # Mandatory Access Control Framework #options CONFIG_MACF_MACH # MACF applied to Mach services diff --git a/osfmk/conf/MASTER.ppc b/osfmk/conf/MASTER.ppc index d655eea9e..09dfbf8ee 100644 --- a/osfmk/conf/MASTER.ppc +++ b/osfmk/conf/MASTER.ppc @@ -58,5 +58,9 @@ options POWERMAC options DISPATCH_COUNTS +# +# Note: MAC options must be set in all the bsd/conf, osfmk/conf, and +# security/conf MASTER files. 
+# options CONFIG_MACF # Mandatory Access Control Framework #options CONFIG_MACF_MACH # MACF applied to Mach services diff --git a/osfmk/i386/AT386/model_dep.c b/osfmk/i386/AT386/model_dep.c index eb86e791e..6743dc70b 100644 --- a/osfmk/i386/AT386/model_dep.c +++ b/osfmk/i386/AT386/model_dep.c @@ -670,11 +670,13 @@ panic_io_port_read(void) { /* For use with the MP rendezvous mechanism */ +#if !CONFIG_EMBEDDED static void machine_halt_cpu(__unused void *arg) { panic_io_port_read(); pmCPUHalt(PM_HALT_DEBUG); } +#endif void Debugger( diff --git a/osfmk/i386/acpi.c b/osfmk/i386/acpi.c index 539a82fde..64c21447e 100644 --- a/osfmk/i386/acpi.c +++ b/osfmk/i386/acpi.c @@ -173,7 +173,11 @@ acpi_sleep_kernel(acpi_sleep_callback func, void *refcon) acpi_sleep_cpu(func, refcon); #endif - /* reset UART if kprintf is enabled */ + /* Reset UART if kprintf is enabled. + * However kprintf should not be used before rtc_sleep_wakeup() + * for compatibility with firewire kprintf. + */ + if (FALSE == disable_serial_output) serial_init(); diff --git a/osfmk/i386/hpet.c b/osfmk/i386/hpet.c index 52c70d903..5ae9621d5 100644 --- a/osfmk/i386/hpet.c +++ b/osfmk/i386/hpet.c @@ -280,24 +280,6 @@ hpet_init(void) DBG(" CVT: HPET to BUS = %08X.%08X\n", (uint32_t)(hpet2bus >> 32), (uint32_t)hpet2bus); - /* Make sure the counter is off in the HPET configuration flags */ - uint64_t hpetcon = ((hpetReg_t *)hpetArea)->GEN_CONF; - hpetcon = hpetcon & ~1; - ((hpetReg_t *)hpetArea)->GEN_CONF = hpetcon; - - /* - * Convert current TSC to HPET value, - * set it, and start it ticking. 
- */ - uint64_t currtsc = rdtsc64(); - uint64_t tscInHPET = tmrCvt(currtsc, tsc2hpet); - ((hpetReg_t *)hpetArea)->MAIN_CNT = tscInHPET; - hpetcon = hpetcon | 1; - ((hpetReg_t *)hpetArea)->GEN_CONF = hpetcon; - kprintf("HPET started: TSC = %08X.%08X, HPET = %08X.%08X\n", - (uint32_t)(currtsc >> 32), (uint32_t)currtsc, - (uint32_t)(tscInHPET >> 32), (uint32_t)tscInHPET); - #if MACH_KDB db_display_hpet((hpetReg_t *)hpetArea); /* (BRINGUP) */ #endif @@ -317,8 +299,13 @@ hpet_get_info(hpetInfo_t *info) info->hpet2tsc = hpet2tsc; info->bus2hpet = bus2hpet; info->hpet2bus = hpet2bus; - info->rcbaArea = rcbaArea; - info->rcbaAreap = rcbaAreap; + /* + * XXX + * We're repurposing the rcbaArea so we can use the HPET. + * Eventually we'll rename this correctly. + */ + info->rcbaArea = hpetArea; + info->rcbaAreap = hpetAreap; } diff --git a/osfmk/i386/machine_check.c b/osfmk/i386/machine_check.c index 8abf223f9..214a588b7 100644 --- a/osfmk/i386/machine_check.c +++ b/osfmk/i386/machine_check.c @@ -308,11 +308,18 @@ mca_dump(void) { ia32_mcg_status_t status; - mca_exception_taken = TRUE; mca_save_state(); - /* Serialize in case of multiple simultaneous machine-checks */ + /* + * Serialize in case of multiple simultaneous machine-checks. + * Only the first caller is allowed to print MCA registers. 
+ */ simple_lock(&mca_lock); + if (mca_exception_taken) { + simple_unlock(&mca_lock); + return; + } + mca_exception_taken = TRUE; /* * Report machine-check capabilities: diff --git a/osfmk/i386/misc_protos.h b/osfmk/i386/misc_protos.h index 8d9036a15..93d45455d 100644 --- a/osfmk/i386/misc_protos.h +++ b/osfmk/i386/misc_protos.h @@ -126,7 +126,7 @@ extern void rtc_clock_stepping( extern void rtc_clock_stepped( uint32_t new_frequency, uint32_t old_frequency); -extern void rtc_clock_napped(uint64_t); +extern void rtc_clock_napped(uint64_t, uint64_t); extern void x86_lowmem_free(void); diff --git a/osfmk/i386/mp_desc.c b/osfmk/i386/mp_desc.c index 12a071c2c..75bbe25cf 100644 --- a/osfmk/i386/mp_desc.c +++ b/osfmk/i386/mp_desc.c @@ -457,8 +457,11 @@ fast_syscall_init64(void) */ wrmsr64(MSR_IA32_KERNEL_GS_BASE, UBER64((unsigned long)current_cpu_datap())); + +#if ONLY_SAFE_FOR_LINDA_SERIAL kprintf("fast_syscall_init64() KERNEL_GS_BASE=0x%016llx\n", rdmsr64(MSR_IA32_KERNEL_GS_BASE)); +#endif } /* @@ -725,7 +728,9 @@ cpu_desc_load64(cpu_data_t *cdp) ml_load_desc64(); +#if ONLY_SAFE_FOR_LINDA_SERIAL kprintf("64-bit descriptor tables loaded\n"); +#endif } void diff --git a/osfmk/i386/pmap.c b/osfmk/i386/pmap.c index 36dae2f3e..72ecf5f76 100644 --- a/osfmk/i386/pmap.c +++ b/osfmk/i386/pmap.c @@ -221,7 +221,11 @@ void dump_4GB_pdpt_thread(thread_t tp); #define iswired(pte) ((pte) & INTEL_PTE_WIRED) int nx_enabled = 1; /* enable no-execute protection */ +#ifdef CONFIG_EMBEDDED +int allow_data_exec = 0; /* no exec from data, embedded is hardcore like that */ +#else int allow_data_exec = VM_ABI_32; /* 32-bit apps may execute data by default, 64-bit apps may not */ +#endif int allow_stack_exec = 0; /* No apps may execute from the stack by default */ int cpu_64bit = 0; diff --git a/osfmk/i386/rtclock.c b/osfmk/i386/rtclock.c index bad2abbe7..a1784f3bf 100644 --- a/osfmk/i386/rtclock.c +++ b/osfmk/i386/rtclock.c @@ -107,6 +107,28 @@ extern uint64_t _rtc_nanotime_read( rtc_nanotime_t 
rtc_nanotime_info = {0,0,0,0,1,0}; +/* + * tsc_to_nanoseconds: + * + * Basic routine to convert a raw 64 bit TSC value to a + * 64 bit nanosecond value. The conversion is implemented + * based on the scale factor and an implicit 32 bit shift. + */ +static inline uint64_t +_tsc_to_nanoseconds(uint64_t value) +{ + asm volatile("movl %%edx,%%esi ;" + "mull %%ecx ;" + "movl %%edx,%%edi ;" + "movl %%esi,%%eax ;" + "mull %%ecx ;" + "addl %%edi,%%eax ;" + "adcl $0,%%edx " + : "+A" (value) : "c" (rtc_nanotime_info.scale) : "esi", "edi"); + + return (value); +} + static uint32_t deadline_to_decrementer( uint64_t deadline, @@ -234,26 +256,31 @@ rtc_nanotime_read(void) /* * rtc_clock_napped: * - * Invoked from power manangement when we have awoken from a nap (C3/C4) - * during which the TSC lost counts. The nanotime data is updated according - * to the provided value which indicates the number of nanoseconds that the - * TSC was not counting. - * - * The caller must guarantee non-reentrancy. + * Invoked from power management when we exit from a low C-State (>= C4) + * and the TSC has stopped counting. The nanotime data is updated according + * to the provided value which represents the new value for nanotime. */ void -rtc_clock_napped( - uint64_t delta) +rtc_clock_napped(uint64_t base, uint64_t tsc_base) { rtc_nanotime_t *rntp = &rtc_nanotime_info; - uint32_t generation; + uint64_t oldnsecs; + uint64_t newnsecs; + uint64_t tsc; assert(!ml_get_interrupts_enabled()); - generation = rntp->generation; - rntp->generation = 0; - rntp->ns_base += delta; - rntp->generation = ((generation + 1) != 0) ? (generation + 1) : 1; - rtc_nanotime_set_commpage(rntp); + tsc = rdtsc64(); + oldnsecs = rntp->ns_base + _tsc_to_nanoseconds(tsc - rntp->tsc_base); + newnsecs = base + _tsc_to_nanoseconds(tsc - tsc_base); + + /* + * Only update the base values if time using the new base values + * is later than the time using the old base values. 
+ */ + if (oldnsecs < newnsecs) { + _rtc_nanotime_store(tsc_base, base, rntp->scale, rntp->shift, rntp); + rtc_nanotime_set_commpage(rntp); + } } void diff --git a/osfmk/i386/startup64.c b/osfmk/i386/startup64.c index 0998530b1..b252c496a 100644 --- a/osfmk/i386/startup64.c +++ b/osfmk/i386/startup64.c @@ -115,7 +115,9 @@ cpu_IA32e_enable(cpu_data_t *cdp) : "i" (CR0_PG) : "eax" ); +#if ONLY_SAFE_FOR_LINDA_SERIAL kprintf("cpu_IA32e_enable(%p)\n", cdp); +#endif if ((rdmsr64(MSR_IA32_EFER) & MSR_IA32_EFER_LMA) == 0) panic("cpu_IA32e_enable() MSR_IA32_EFER_LMA not asserted"); diff --git a/osfmk/i386/thread.h b/osfmk/i386/thread.h index 85a82e880..badd5491f 100644 --- a/osfmk/i386/thread.h +++ b/osfmk/i386/thread.h @@ -79,7 +79,7 @@ #include /* - * i386_saved_state: + * x86_saved_state32/64: * * Has been exported to servers. See: mach/i386/thread_status.h * diff --git a/osfmk/i386/tsc.c b/osfmk/i386/tsc.c index 724bb0f9f..19b7469a6 100644 --- a/osfmk/i386/tsc.c +++ b/osfmk/i386/tsc.c @@ -160,13 +160,16 @@ tsc_init(void) * Get the TSC increment. The TSC is incremented by this * on every bus tick. Calculate the TSC conversion factors * to and from nano-seconds. + * The tsc granularity is also called the "bus ratio". If the N/2 bit + * is set this indicates the bus ratio is 0.5 more than this - i.e. + * that the true bus ratio is (2*tscGranularity + 1)/2.
*/ if (cpuid_info()->cpuid_family == CPU_FAMILY_PENTIUM_M) { uint64_t prfsts; prfsts = rdmsr64(IA32_PERF_STS); tscGranularity = (uint32_t)bitfield(prfsts, 44, 40); - N_by_2_bus_ratio = prfsts & bit(46); + N_by_2_bus_ratio = (prfsts & bit(46)) != 0; } else { panic("rtclock_init: unknown CPU family: 0x%X\n", @@ -174,20 +177,20 @@ tsc_init(void) } if (N_by_2_bus_ratio) - tscFCvtt2n = busFCvtt2n * 2 / (uint64_t)tscGranularity; + tscFCvtt2n = busFCvtt2n * 2 / (1 + 2*tscGranularity); else - tscFCvtt2n = busFCvtt2n / (uint64_t)tscGranularity; + tscFCvtt2n = busFCvtt2n / tscGranularity; tscFreq = ((1 * Giga) << 32) / tscFCvtt2n; tscFCvtn2t = 0xFFFFFFFFFFFFFFFFULL / tscFCvtt2n; kprintf(" TSC: Frequency = %6d.%04dMHz, " - "cvtt2n = %08X.%08X, cvtn2t = %08X.%08X, gran = %lld\n", + "cvtt2n = %08X.%08X, cvtn2t = %08X.%08X, gran = %lld%s\n", (uint32_t)(tscFreq / Mega), (uint32_t)(tscFreq % Mega), (uint32_t)(tscFCvtt2n >> 32), (uint32_t)tscFCvtt2n, (uint32_t)(tscFCvtn2t >> 32), (uint32_t)tscFCvtn2t, - tscGranularity); + tscGranularity, N_by_2_bus_ratio ? 
" (N/2)" : ""); /* * Calculate conversion from BUS to TSC diff --git a/osfmk/kern/etimer.h b/osfmk/kern/etimer.h index 84674a990..29eebc4a8 100644 --- a/osfmk/kern/etimer.h +++ b/osfmk/kern/etimer.h @@ -53,8 +53,6 @@ extern int setPop(uint64_t time); extern void etimer_resync_deadlines(void); -extern uint32_t rtclock_tick_interval; - #if 0 /* this is currently still MD */ #pragma pack(push,4) struct rtclock_timer_t { diff --git a/osfmk/kern/locks.c b/osfmk/kern/locks.c index 064a58252..5f9d4d80a 100644 --- a/osfmk/kern/locks.c +++ b/osfmk/kern/locks.c @@ -561,25 +561,20 @@ lck_mtx_lock_wait ( priority = self->sched_pri; if (priority < self->priority) priority = self->priority; - if (priority > MINPRI_KERNEL) - priority = MINPRI_KERNEL; - else if (priority < BASEPRI_DEFAULT) priority = BASEPRI_DEFAULT; thread_lock(holder); if (mutex->lck_mtx_pri == 0) holder->promotions++; - if (holder->priority < MINPRI_KERNEL) { - holder->sched_mode |= TH_MODE_PROMOTED; - if ( mutex->lck_mtx_pri < priority && + holder->sched_mode |= TH_MODE_PROMOTED; + if ( mutex->lck_mtx_pri < priority && holder->sched_pri < priority ) { - KERNEL_DEBUG_CONSTANT( - MACHDBG_CODE(DBG_MACH_SCHED,MACH_PROMOTE) | DBG_FUNC_NONE, + KERNEL_DEBUG_CONSTANT( + MACHDBG_CODE(DBG_MACH_SCHED,MACH_PROMOTE) | DBG_FUNC_NONE, holder->sched_pri, priority, (int)holder, (int)lck, 0); - set_sched_pri(holder, priority); - } + set_sched_pri(holder, priority); } thread_unlock(holder); splx(s); @@ -654,15 +649,13 @@ lck_mtx_lock_acquire( thread_lock(thread); thread->promotions++; - if (thread->priority < MINPRI_KERNEL) { - thread->sched_mode |= TH_MODE_PROMOTED; - if (thread->sched_pri < priority) { - KERNEL_DEBUG_CONSTANT( - MACHDBG_CODE(DBG_MACH_SCHED,MACH_PROMOTE) | DBG_FUNC_NONE, + thread->sched_mode |= TH_MODE_PROMOTED; + if (thread->sched_pri < priority) { + KERNEL_DEBUG_CONSTANT( + MACHDBG_CODE(DBG_MACH_SCHED,MACH_PROMOTE) | DBG_FUNC_NONE, thread->sched_pri, priority, 0, (int)lck, 0); - set_sched_pri(thread, 
priority); - } + set_sched_pri(thread, priority); } thread_unlock(thread); splx(s); diff --git a/osfmk/kern/mach_clock.c b/osfmk/kern/mach_clock.c index 2c7b7f254..779855296 100644 --- a/osfmk/kern/mach_clock.c +++ b/osfmk/kern/mach_clock.c @@ -106,9 +106,7 @@ hertz_tick( #endif { processor_t processor = current_processor(); -#if !GPROF thread_t thread = current_thread(); -#endif timer_t state; if (usermode) { @@ -117,8 +115,11 @@ hertz_tick( state = &PROCESSOR_DATA(processor, user_state); } else { - TIMER_BUMP(&thread->system_timer, ticks); - + /* If this thread is idling, do not charge that time as system time */ + if ((thread->state & TH_IDLE) == 0) { + TIMER_BUMP(&thread->system_timer, ticks); + } + if (processor->state == PROCESSOR_IDLE) state = &PROCESSOR_DATA(processor, idle_state); else diff --git a/osfmk/kern/priority.c b/osfmk/kern/priority.c index 4f08fd378..6564cc97c 100644 --- a/osfmk/kern/priority.c +++ b/osfmk/kern/priority.c @@ -96,7 +96,7 @@ thread_quantum_expire( /* * Check for fail-safe trip. 
*/ - if (!(thread->sched_mode & TH_MODE_TIMESHARE)) { + if (!(thread->sched_mode & (TH_MODE_TIMESHARE|TH_MODE_PROMOTED))) { uint64_t new_computation; new_computation = processor->quantum_end; @@ -115,7 +115,6 @@ thread_quantum_expire( thread->safe_release = sched_tick + sched_safe_duration; thread->sched_mode |= (TH_MODE_FAILSAFE|TH_MODE_TIMESHARE); - thread->sched_mode &= ~TH_MODE_PREEMPT; } } diff --git a/osfmk/kern/sched_prim.c b/osfmk/kern/sched_prim.c index 0cbde484c..e2027c066 100644 --- a/osfmk/kern/sched_prim.c +++ b/osfmk/kern/sched_prim.c @@ -150,6 +150,7 @@ void (*pm_tick_callout)(void) = NULL; void wait_queues_init(void) __attribute__((section("__TEXT, initcode"))); static void load_shift_init(void) __attribute__((section("__TEXT, initcode"))); +static void preempt_pri_init(void) __attribute__((section("__TEXT, initcode"))); static thread_t thread_select_idle( thread_t thread, @@ -181,8 +182,6 @@ boolean_t thread_runnable( #endif /*DEBUG*/ - - /* * State machine * @@ -243,6 +242,7 @@ struct wait_queue wait_queues[NUMQUEUES]; ((((int)(event) < 0)? ~(int)(event): (int)(event)) % NUMQUEUES) int8_t sched_load_shifts[NRQS]; +int sched_preempt_pri[NRQBM]; void sched_init(void) @@ -262,6 +262,7 @@ sched_init(void) wait_queues_init(); load_shift_init(); + preempt_pri_init(); simple_lock_init(&rt_lock, 0); run_queue_init(&rt_runq); sched_tick = 0; @@ -299,9 +300,15 @@ sched_timebase_init(void) /* scheduler tick interval */ clock_interval_to_absolutetime_interval(USEC_PER_SEC >> SCHED_TICK_SHIFT, NSEC_PER_USEC, &abstime); - assert((abstime >> 32) == 0 && (uint32_t)abstime != 0); sched_tick_interval = abstime; +#if DEBUG + printf("Quantum: %d. Smallest quantum: %d. Min Rt/Max Rt: %d/%d." + " Tick: %d.\n", + std_quantum, min_std_quantum, min_rt_quantum, max_rt_quantum, + sched_tick_interval); +#endif + /* * Compute conversion factor from usage to * timesharing priorities with 5/8 ** n aging. 
@@ -343,6 +350,18 @@ load_shift_init(void) } } +static void +preempt_pri_init(void) +{ + int i, *p = sched_preempt_pri; + + for (i = BASEPRI_FOREGROUND + 1; i < MINPRI_KERNEL; ++i) + setbit(i, p); + + for (i = BASEPRI_PREEMPT; i <= MAXPRI; ++i) + setbit(i, p); +} + /* * Thread wait timer expiration. */ @@ -1200,8 +1219,8 @@ thread_select( ((queue_entry_t)thread)->next->prev = q; q->next = ((queue_entry_t)thread)->next; thread->runq = PROCESSOR_NULL; - assert(thread->sched_mode & TH_MODE_PREEMPT); runq->count--; runq->urgency--; + assert(runq->urgency >= 0); if (queue_empty(q)) { if (runq->highq != IDLEPRI) clrbit(MAXPRI - runq->highq, runq->bitmap); @@ -1916,8 +1935,9 @@ run_queue_dequeue( thread->runq = PROCESSOR_NULL; rq->count--; - if (thread->sched_mode & TH_MODE_PREEMPT) - rq->urgency--; + if (testbit(rq->highq, sched_preempt_pri)) { + rq->urgency--; assert(rq->urgency >= 0); + } if (queue_empty(queue)) { if (rq->highq != IDLEPRI) clrbit(MAXPRI - rq->highq, rq->bitmap); @@ -1971,7 +1991,6 @@ realtime_queue_insert( } thread->runq = RT_RUNQ; - assert(thread->sched_mode & TH_MODE_PREEMPT); rq->count++; rq->urgency++; simple_unlock(&rt_lock); @@ -2060,7 +2079,7 @@ processor_enqueue( enqueue_head(queue, (queue_entry_t)thread); thread->runq = processor; - if (thread->sched_mode & TH_MODE_PREEMPT) + if (testbit(thread->sched_pri, sched_preempt_pri)) rq->urgency++; rq->count++; @@ -2106,7 +2125,7 @@ processor_setrun( /* * Set preemption mode. 
*/ - if (thread->sched_mode & TH_MODE_PREEMPT) + if (testbit(thread->sched_pri, sched_preempt_pri)) preempt = (AST_PREEMPT | AST_URGENT); else if (thread->sched_mode & TH_MODE_TIMESHARE && thread->priority < BASEPRI_BACKGROUND) @@ -2409,8 +2428,9 @@ processor_queue_shutdown( thread->runq = PROCESSOR_NULL; rq->count--; - if (thread->sched_mode & TH_MODE_PREEMPT) - rq->urgency--; + if (testbit(pri, sched_preempt_pri)) { + rq->urgency--; assert(rq->urgency >= 0); + } if (queue_empty(queue)) { if (pri != IDLEPRI) clrbit(MAXPRI - pri, rq->bitmap); @@ -2524,15 +2544,6 @@ set_sched_pri( { boolean_t removed = run_queue_remove(thread); - if ( !(thread->sched_mode & TH_MODE_TIMESHARE) && - (priority >= BASEPRI_PREEMPT || - (thread->task_priority < MINPRI_KERNEL && - thread->task_priority >= BASEPRI_BACKGROUND && - priority > thread->task_priority) ) ) - thread->sched_mode |= TH_MODE_PREEMPT; - else - thread->sched_mode &= ~TH_MODE_PREEMPT; - thread->sched_pri = priority; if (removed) thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ); @@ -2630,9 +2641,9 @@ run_queue_remove( */ remqueue(&rq->queues[0], (queue_entry_t)thread); rq->count--; - if (thread->sched_mode & TH_MODE_PREEMPT) - rq->urgency--; - assert(rq->urgency >= 0); + if (testbit(thread->sched_pri, sched_preempt_pri)) { + rq->urgency--; assert(rq->urgency >= 0); + } if (queue_empty(rq->queues + thread->sched_pri)) { /* update run queue status */ @@ -2741,8 +2752,9 @@ steal_thread( thread->runq = PROCESSOR_NULL; rq->count--; - if (thread->sched_mode & TH_MODE_PREEMPT) - rq->urgency--; + if (testbit(pri, sched_preempt_pri)) { + rq->urgency--; assert(rq->urgency >= 0); + } if (queue_empty(queue)) { if (pri != IDLEPRI) clrbit(MAXPRI - pri, rq->bitmap); @@ -2807,9 +2819,6 @@ processor_idle( break; } - KERNEL_DEBUG_CONSTANT( - MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END, (int)thread, 0, 0, 0, 0); - timer_switch(&PROCESSOR_DATA(processor, idle_state), mach_absolute_time(), &PROCESSOR_DATA(processor, 
system_state)); PROCESSOR_DATA(processor, current_state) = &PROCESSOR_DATA(processor, system_state); @@ -2829,8 +2838,8 @@ processor_idle( processor->next_thread = THREAD_NULL; processor->state = PROCESSOR_RUNNING; - if ( processor->runq.highq > new_thread->sched_pri || - rt_runq.highq >= new_thread->sched_pri ) { + if ( processor->runq.highq > new_thread->sched_pri || + (rt_runq.highq > 0 && rt_runq.highq >= new_thread->sched_pri) ) { processor->deadline = UINT64_MAX; pset_unlock(pset); @@ -2839,11 +2848,17 @@ processor_idle( thread_setrun(new_thread, SCHED_HEADQ); thread_unlock(new_thread); + KERNEL_DEBUG_CONSTANT( + MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END, (int)thread, (int)state, 0, 0, 0); + return (THREAD_NULL); } pset_unlock(pset); + KERNEL_DEBUG_CONSTANT( + MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END, (int)thread, (int)state, (int)new_thread, 0, 0); + return (new_thread); } else @@ -2870,12 +2885,18 @@ processor_idle( thread_setrun(new_thread, SCHED_HEADQ); thread_unlock(new_thread); + KERNEL_DEBUG_CONSTANT( + MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END, (int)thread, (int)state, 0, 0, 0); + return (THREAD_NULL); } } pset_unlock(pset); + KERNEL_DEBUG_CONSTANT( + MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END, (int)thread, (int)state, 0, 0, 0); + return (THREAD_NULL); } diff --git a/osfmk/kern/syscall_subr.c b/osfmk/kern/syscall_subr.c index f7855e114..311e96c7d 100644 --- a/osfmk/kern/syscall_subr.c +++ b/osfmk/kern/syscall_subr.c @@ -331,7 +331,6 @@ thread_depress_abstime( self->sched_pri = DEPRESSPRI; myprocessor->current_pri = self->sched_pri; - self->sched_mode &= ~TH_MODE_PREEMPT; self->sched_mode |= TH_MODE_DEPRESS; if (interval != 0) { @@ -427,7 +426,6 @@ thread_poll_yield( if (!(self->sched_mode & TH_MODE_ISDEPRESSED)) { self->sched_pri = DEPRESSPRI; myprocessor->current_pri = self->sched_pri; - self->sched_mode &= ~TH_MODE_PREEMPT; } self->computation_epoch = abstime; self->computation_metered = 0; diff --git 
a/osfmk/kern/thread.h b/osfmk/kern/thread.h index 8387019b8..4cca24656 100644 --- a/osfmk/kern/thread.h +++ b/osfmk/kern/thread.h @@ -185,14 +185,13 @@ struct thread { integer_t sched_mode; /* scheduling mode bits */ #define TH_MODE_REALTIME 0x0001 /* time constraints supplied */ #define TH_MODE_TIMESHARE 0x0002 /* use timesharing algorithm */ -#define TH_MODE_PREEMPT 0x0004 /* can preempt kernel contexts */ -#define TH_MODE_FAILSAFE 0x0008 /* fail-safe has tripped */ -#define TH_MODE_PROMOTED 0x0010 /* sched pri has been promoted */ -#define TH_MODE_ABORT 0x0020 /* abort interruptible waits */ -#define TH_MODE_ABORTSAFELY 0x0040 /* ... but only those at safe point */ +#define TH_MODE_FAILSAFE 0x0004 /* fail-safe has tripped */ +#define TH_MODE_PROMOTED 0x0008 /* sched pri has been promoted */ +#define TH_MODE_ABORT 0x0010 /* abort interruptible waits */ +#define TH_MODE_ABORTSAFELY 0x0020 /* ... but only those at safe point */ #define TH_MODE_ISABORTED (TH_MODE_ABORT | TH_MODE_ABORTSAFELY) -#define TH_MODE_DEPRESS 0x0080 /* normal depress yield */ -#define TH_MODE_POLLDEPRESS 0x0100 /* polled depress yield */ +#define TH_MODE_DEPRESS 0x0040 /* normal depress yield */ +#define TH_MODE_POLLDEPRESS 0x0080 /* polled depress yield */ #define TH_MODE_ISDEPRESSED (TH_MODE_DEPRESS | TH_MODE_POLLDEPRESS) integer_t sched_pri; /* scheduled (current) priority */ diff --git a/osfmk/kern/thread_act.c b/osfmk/kern/thread_act.c index da6510382..4fcb5f957 100644 --- a/osfmk/kern/thread_act.c +++ b/osfmk/kern/thread_act.c @@ -759,7 +759,6 @@ special_handler_continue(void) thread->sched_pri = DEPRESSPRI; myprocessor->current_pri = thread->sched_pri; - thread->sched_mode &= ~TH_MODE_PREEMPT; } thread_unlock(thread); splx(s); diff --git a/osfmk/mach/i386/thread_status.h b/osfmk/mach/i386/thread_status.h index d8c38843f..173e79a8b 100644 --- a/osfmk/mach/i386/thread_status.h +++ b/osfmk/mach/i386/thread_status.h @@ -361,7 +361,7 @@ struct x86_saved_state32_tagged { typedef struct 
x86_saved_state32_tagged x86_saved_state32_tagged_t; struct x86_sframe32 { -/* + /* * in case we throw a fault reloading * segment registers on a return out of * the kernel... the 'slf' state is only kept diff --git a/osfmk/mach/machine.h b/osfmk/mach/machine.h index 8a73ba9d8..e28a2c537 100644 --- a/osfmk/mach/machine.h +++ b/osfmk/mach/machine.h @@ -345,6 +345,8 @@ __END_DECLS #define CPU_SUBTYPE_ARM_ALL ((cpu_subtype_t) 0) #define CPU_SUBTYPE_ARM_V4T ((cpu_subtype_t) 5) #define CPU_SUBTYPE_ARM_V6 ((cpu_subtype_t) 6) +#define CPU_SUBTYPE_ARM_V5TEJ ((cpu_subtype_t) 7) +#define CPU_SUBTYPE_ARM_XSCALE ((cpu_subtype_t) 8) /* * CPU families (sysctl hw.cpufamily) @@ -368,6 +370,7 @@ __END_DECLS #define CPUFAMILY_INTEL_6_26 0x6b5a4cd2 /* Nehalem */ #define CPUFAMILY_ARM_9 0xe73283ae #define CPUFAMILY_ARM_11 0x8ff620d8 +#define CPUFAMILY_ARM_XSCALE 0x53b005f5 #define CPUFAMILY_INTEL_YONAH CPUFAMILY_INTEL_6_14 #define CPUFAMILY_INTEL_MEROM CPUFAMILY_INTEL_6_15 diff --git a/osfmk/vm/bsd_vm.c b/osfmk/vm/bsd_vm.c index 6721d47bf..b02804825 100644 --- a/osfmk/vm/bsd_vm.c +++ b/osfmk/vm/bsd_vm.c @@ -321,6 +321,8 @@ trigger_name_to_port( extern int uiomove64(addr64_t, int, void *); #define MAX_RUN 32 +unsigned long vm_cs_tainted_forces = 0; + int memory_object_control_uiomove( memory_object_control_t control, @@ -396,8 +398,18 @@ memory_object_control_uiomove( */ assert(!dst_page->encrypted); - if (mark_dirty) + if (mark_dirty) { dst_page->dirty = TRUE; + if (dst_page->cs_validated) { + /* + * CODE SIGNING: + * We're modifying a code-signed + * page: assume that it is now tainted. 
+ */ + dst_page->cs_tainted = TRUE; + vm_cs_tainted_forces++; + } + } dst_page->busy = TRUE; page_run[cur_run++] = dst_page; diff --git a/osfmk/vm/vm_fault.c b/osfmk/vm/vm_fault.c index 87ffc3ee7..809d71e17 100644 --- a/osfmk/vm/vm_fault.c +++ b/osfmk/vm/vm_fault.c @@ -149,6 +149,12 @@ extern void vm_fault_classify(vm_object_t object, extern void vm_fault_classify_init(void); #endif + +unsigned long vm_cs_validates = 0; +unsigned long vm_cs_revalidates = 0; +unsigned long vm_cs_query_modified = 0; +unsigned long vm_cs_validated_dirtied = 0; + /* * Routine: vm_fault_init * Purpose: @@ -1988,19 +1994,21 @@ vm_fault_enter(vm_page_t m, cache_attr = ((unsigned int)m->object->wimg_bits) & VM_WIMG_MASK; - if (m->object->code_signed && !m->cs_validated && - pmap != kernel_pmap) { - /* - * CODE SIGNING: - * This page comes from a VM object backed by a - * signed memory object and it hasn't been validated yet. - * We're about to enter it into a process address space, - * so we need to validate its signature now. - */ + if (m->object->code_signed && pmap != kernel_pmap && + (!m->cs_validated || m->wpmapped)) { vm_object_lock_assert_exclusive(m->object); - /* VM map still locked, so 1 ref will remain on VM object */ + if (m->cs_validated && m->wpmapped) { + vm_cs_revalidates++; + } + /* + * CODE SIGNING: + * This page comes from a VM object backed by a signed + * memory object. We are about to enter it into a process + * address space, so we need to validate its signature. 
+ */ + /* VM map is locked, so 1 ref will remain on VM object */ vm_page_validate_cs(m); } @@ -2087,6 +2095,10 @@ vm_fault_enter(vm_page_t m, * that's needed for an AtomicCompareAndSwap */ m->pmapped = TRUE; + if (prot & VM_PROT_WRITE) { + vm_object_lock_assert_exclusive(m->object); + m->wpmapped = TRUE; + } PMAP_ENTER(pmap, vaddr, m, prot, cache_attr, wired); } @@ -2273,7 +2285,6 @@ RetryFault: */ if (wired) { fault_type = prot | VM_PROT_WRITE; - /* * since we're treating this fault as a 'write' * we must hold the top object lock exclusively @@ -2500,9 +2511,10 @@ RetryFault: } ASSERT_PAGE_DECRYPTED(m); - if (m->object->code_signed && !m->cs_validated) { + if (m->object->code_signed && map != kernel_map && + (!m->cs_validated || m->wpmapped)) { /* - * We will need to validate this page + * We might need to validate this page * against its code signature, so we * want to hold the VM object exclusively. */ @@ -2547,8 +2559,23 @@ RetryFault: * --> must disallow write. */ - if (object == cur_object && object->copy == VM_OBJECT_NULL) + if (object == cur_object && object->copy == VM_OBJECT_NULL) { + if ((fault_type & VM_PROT_WRITE) == 0) { + /* + * This is not a "write" fault, so we + * might not have taken the object lock + * exclusively and we might not be able + * to update the "wpmapped" bit in + * vm_fault_enter(). + * Let's just grant read access to + * the page for now and we'll + * soft-fault again if we need write + * access later... 
+ */ + prot &= ~VM_PROT_WRITE; + } goto FastPmapEnter; + } if ((fault_type & VM_PROT_WRITE) == 0) { @@ -4117,13 +4144,51 @@ vm_page_validate_cs( boolean_t validated, tainted; boolean_t busy_page; - vm_object_lock_assert_exclusive(page->object); - assert(!page->cs_validated); + vm_object_lock_assert_held(page->object); if (!cs_validation) { return; } + if (page->cs_validated && !page->cs_tainted && page->wpmapped) { + vm_object_lock_assert_exclusive(page->object); + + /* + * This page has already been validated and found to + * be valid. However, it was mapped for "write" access + * sometime in the past, so we have to check if it was + * modified. If so, it needs to be revalidated. + * If the page was already found to be "tainted", no + * need to re-validate. + */ + if (!page->dirty) { + vm_cs_query_modified++; + page->dirty = pmap_is_modified(page->phys_page); + } + if (page->dirty) { + /* + * The page is dirty, so let's clear its + * "validated" bit and re-validate it. + */ + if (cs_debug) { + printf("CODESIGNING: vm_page_validate_cs: " + "page %p obj %p off 0x%llx " + "was modified\n", + page, page->object, page->offset); + } + page->cs_validated = FALSE; + vm_cs_validated_dirtied++; + } + } + + if (page->cs_validated) { + return; + } + + vm_object_lock_assert_exclusive(page->object); + + vm_cs_validates++; + object = page->object; assert(object->code_signed); offset = page->offset; diff --git a/osfmk/vm/vm_map.c b/osfmk/vm/vm_map.c index f20b587c1..74e805b79 100644 --- a/osfmk/vm/vm_map.c +++ b/osfmk/vm/vm_map.c @@ -1749,10 +1749,13 @@ StartAgain: ; } for (; entry->vme_start < end; entry = entry->vme_next) { + /* + * Check if the mapping's attributes + * match the existing map entry. 
+ */ if (entry == vm_map_to_entry(map) || entry->vme_start != tmp_start || entry->is_sub_map != is_submap || - entry->object.vm_object != object || entry->offset != tmp_offset || entry->needs_copy != needs_copy || entry->protection != cur_protection || @@ -1762,6 +1765,36 @@ StartAgain: ; /* not the same mapping ! */ RETURN(KERN_NO_SPACE); } + /* + * Check if the same object is being mapped. + */ + if (is_submap) { + if (entry->object.sub_map != + (vm_map_t) object) { + /* not the same submap */ + RETURN(KERN_NO_SPACE); + } + } else { + if (entry->object.vm_object != object) { + /* not the same VM object... */ + vm_object_t obj2; + + obj2 = entry->object.vm_object; + if ((obj2 == VM_OBJECT_NULL || + obj2->internal) && + (object == VM_OBJECT_NULL || + object->internal)) { + /* + * ... but both are + * anonymous memory, + * so equivalent. + */ + } else { + RETURN(KERN_NO_SPACE); + } + } + } + tmp_offset += entry->vme_end - entry->vme_start; tmp_start += entry->vme_end - entry->vme_start; if (entry->vme_end >= end) { @@ -7978,8 +8011,8 @@ submap_recurse: if(submap_entry->wired_count != 0 || - (sub_object->copy_strategy != - MEMORY_OBJECT_COPY_SYMMETRIC)) { + (sub_object->copy_strategy == + MEMORY_OBJECT_COPY_NONE)) { vm_object_lock(sub_object); vm_object_copy_slowly(sub_object, submap_entry->offset, @@ -8086,7 +8119,7 @@ submap_recurse: entry->max_protection |= submap_entry->max_protection; if(copied_slowly) { - entry->offset = 0; + entry->offset = local_start - old_start; entry->needs_copy = FALSE; entry->is_shared = FALSE; } else { diff --git a/osfmk/vm/vm_object.c b/osfmk/vm/vm_object.c index 60a80d38a..218a49157 100644 --- a/osfmk/vm/vm_object.c +++ b/osfmk/vm/vm_object.c @@ -1205,7 +1205,7 @@ vm_object_terminate( panic("vm_object_terminate.4 %p %p", object, p); } - if (!p->dirty && p->pmapped) + if (!p->dirty && p->wpmapped) p->dirty = pmap_is_modified(p->phys_page); if ((p->dirty || p->precious) && !p->error && object->alive) { diff --git a/osfmk/vm/vm_page.h 
b/osfmk/vm/vm_page.h index 53d137654..4052f9673 100644 --- a/osfmk/vm/vm_page.h +++ b/osfmk/vm/vm_page.h @@ -196,6 +196,8 @@ struct vm_page { fictitious:1, /* Physical page doesn't exist (O) */ pmapped:1, /* page has been entered at some * point into a pmap (O) */ + wpmapped:1, /* page has been entered at some + * point into a pmap for write (O) */ absent:1, /* Data has been requested, but is * not yet available (O) */ error:1, /* Data manager was unable to provide @@ -230,7 +232,7 @@ struct vm_page { /* other pages */ deactivated:1, zero_fill:1, - __unused_object_bits:9; /* 9 bits available here */ + __unused_object_bits:8; /* 8 bits available here */ ppnum_t phys_page; /* Physical address of page, passed * to pmap_enter (read-only) */ @@ -484,6 +486,12 @@ extern void vm_page_insert( vm_object_t object, vm_object_offset_t offset); +extern void vm_page_insert_internal( + vm_page_t page, + vm_object_t object, + vm_object_offset_t offset, + boolean_t queues_lock_held); + extern void vm_page_replace( vm_page_t mem, vm_object_t object, diff --git a/osfmk/vm/vm_pageout.c b/osfmk/vm/vm_pageout.c index 7eeace1d0..0f3e790a6 100644 --- a/osfmk/vm/vm_pageout.c +++ b/osfmk/vm/vm_pageout.c @@ -370,6 +370,7 @@ unsigned int vm_page_speculative_target = 0; vm_object_t vm_pageout_scan_wants_object = VM_OBJECT_NULL; +unsigned long vm_cs_validated_resets = 0; /* * Routine: vm_backing_store_disable @@ -1632,12 +1633,30 @@ consider_inactive: vm_purgeable_q_advance_all(1); } - if (object->copy == VM_OBJECT_NULL && - (object->purgable == VM_PURGABLE_EMPTY || - object->purgable == VM_PURGABLE_VOLATILE)) { - assert(m->wire_count == 0); /* if it's wired, we can't put it on our queue */ - /* just stick it back on! */ - goto reactivate_page; + /* If the object is empty, the page must be reclaimed even if dirty or used. */ + /* If the page belongs to a volatile object, we stick it back on. 
*/ + if (object->copy == VM_OBJECT_NULL) { + if(object->purgable == VM_PURGABLE_EMPTY && !m->cleaning) { + m->busy = TRUE; + if (m->pmapped == TRUE) { + /* unmap the page */ + refmod_state = pmap_disconnect(m->phys_page); + if (refmod_state & VM_MEM_MODIFIED) { + m->dirty = TRUE; + } + } + if (m->dirty || m->precious) { + /* we saved the cost of cleaning this page ! */ + vm_page_purged_count++; + } + goto reclaim_page; + } + if (object->purgable == VM_PURGABLE_VOLATILE) { + /* if it's wired, we can't put it on our queue */ + assert(m->wire_count == 0); + /* just stick it back on! */ + goto reactivate_page; + } } m->pageq.next = NULL; m->pageq.prev = NULL; @@ -2578,6 +2597,7 @@ vm_object_upl_request( wpl_array_t lite_list = NULL; vm_object_t last_copy_object; int delayed_unlock = 0; + int j; if (cntrl_flags & ~UPL_VALID_FLAGS) { /* @@ -2711,11 +2731,34 @@ vm_object_upl_request( } vm_object_unlock(object); VM_PAGE_GRAB_FICTITIOUS(alias_page); - vm_object_lock(object); + goto relock; } - if (delayed_unlock == 0) - vm_page_lock_queues(); + if (delayed_unlock == 0) { + /* + * pageout_scan takes the vm_page_lock_queues first + * then tries for the object lock... to avoid what + * is effectively a lock inversion, we'll go to the + * trouble of taking them in that same order... otherwise + * if this object contains the majority of the pages resident + * in the UBC (or a small set of large objects actively being + * worked on contain the majority of the pages), we could + * cause the pageout_scan thread to 'starve' in its attempt + * to find pages to move to the free queue, since it has to + * successfully acquire the object lock of any candidate page + * before it can steal/clean it. 
+ */ + vm_object_unlock(object); +relock: + for (j = 0; ; j++) { + vm_page_lock_queues(); + if (vm_object_lock_try(object)) + break; + vm_page_unlock_queues(); + mutex_pause(j); + } + delayed_unlock = 1; + } if (cntrl_flags & UPL_COPYOUT_FROM) { upl->flags |= UPL_PAGE_SYNC_DONE; @@ -2848,6 +2891,7 @@ check_busy: dst_page->busy = was_busy; vm_page_lock_queues(); + delayed_unlock = 1; } if (dst_page->pageout_queue == TRUE) /* @@ -3001,6 +3045,7 @@ check_busy: upl_cow_again_pages += xfer_size >> PAGE_SHIFT; vm_page_lock_queues(); + delayed_unlock = 1; } /* * remember the copy object we synced with @@ -3070,14 +3115,8 @@ check_busy: } /* * need to allocate a page - * vm_page_alloc may grab the - * queues lock for a purgeable object - * so drop it */ - delayed_unlock = 0; - vm_page_unlock_queues(); - - dst_page = vm_page_alloc(object, dst_offset); + dst_page = vm_page_grab(); if (dst_page == VM_PAGE_NULL) { if ( (cntrl_flags & (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) == (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) { @@ -3096,14 +3135,41 @@ check_busy: * then try again for the same * offset... */ + delayed_unlock = 0; + vm_page_unlock_queues(); + vm_object_unlock(object); VM_PAGE_WAIT(); - vm_object_lock(object); + + /* + * pageout_scan takes the vm_page_lock_queues first + * then tries for the object lock... to avoid what + * is effectively a lock inversion, we'll go to the + * trouble of taking them in that same order... otherwise + * if this object contains the majority of the pages resident + * in the UBC (or a small set of large objects actively being + * worked on contain the majority of the pages), we could + * cause the pageout_scan thread to 'starve' in its attempt + * to find pages to move to the free queue, since it has to + * successfully acquire the object lock of any candidate page + * before it can steal/clean it. 
+ */ + for (j = 0; ; j++) { + vm_page_lock_queues(); + + if (vm_object_lock_try(object)) + break; + vm_page_unlock_queues(); + mutex_pause(j); + } + delayed_unlock = 1; continue; } - dst_page->busy = FALSE; + vm_page_insert_internal(dst_page, object, dst_offset, TRUE); + dst_page->absent = TRUE; + dst_page->busy = FALSE; if (cntrl_flags & UPL_RET_ONLY_ABSENT) { /* @@ -3116,7 +3182,6 @@ check_busy: */ dst_page->clustered = TRUE; } - vm_page_lock_queues(); } /* * ENCRYPTED SWAP: @@ -3268,7 +3333,29 @@ check_busy: } delay_unlock_queues: if (delayed_unlock++ > UPL_DELAYED_UNLOCK_LIMIT) { + /* + * pageout_scan takes the vm_page_lock_queues first + * then tries for the object lock... to avoid what + * is effectively a lock inversion, we'll go to the + * trouble of taking them in that same order... otherwise + * if this object contains the majority of the pages resident + * in the UBC (or a small set of large objects actively being + * worked on contain the majority of the pages), we could + * cause the pageout_scan thread to 'starve' in its attempt + * to find pages to move to the free queue, since it has to + * successfully acquire the object lock of any candidate page + * before it can steal/clean it. 
+ */ + vm_object_unlock(object); mutex_yield(&vm_page_queue_lock); + + for (j = 0; ; j++) { + if (vm_object_lock_try(object)) + break; + vm_page_unlock_queues(); + mutex_pause(j); + vm_page_lock_queues(); + } delayed_unlock = 1; } try_next_page: @@ -3279,7 +3366,7 @@ try_next_page: if (alias_page != NULL) { if (delayed_unlock == 0) { vm_page_lock_queues(); - delayed_unlock++; + delayed_unlock = 1; } vm_page_free(alias_page); } @@ -3760,6 +3847,7 @@ vm_map_enter_upl( cache_attr = ((unsigned int)m->object->wimg_bits) & VM_WIMG_MASK; m->pmapped = TRUE; + m->wpmapped = TRUE; PMAP_ENTER(map->pmap, addr, m, VM_PROT_ALL, cache_attr, TRUE); } @@ -3844,6 +3932,7 @@ upl_commit_range( int delayed_unlock = 0; int clear_refmod = 0; int pgpgout_count = 0; + int j; *empty = FALSE; @@ -3887,17 +3976,35 @@ upl_commit_range( } else { shadow_object = object; } - vm_object_lock(shadow_object); - entry = offset/PAGE_SIZE; target_offset = (vm_object_offset_t)offset; + /* + * pageout_scan takes the vm_page_lock_queues first + * then tries for the object lock... to avoid what + * is effectively a lock inversion, we'll go to the + * trouble of taking them in that same order... otherwise + * if this object contains the majority of the pages resident + * in the UBC (or a small set of large objects actively being + * worked on contain the majority of the pages), we could + * cause the pageout_scan thread to 'starve' in its attempt + * to find pages to move to the free queue, since it has to + * successfully acquire the object lock of any candidate page + * before it can steal/clean it. 
+ */ + for (j = 0; ; j++) { + vm_page_lock_queues(); + + if (vm_object_lock_try(shadow_object)) + break; + vm_page_unlock_queues(); + mutex_pause(j); + } + delayed_unlock = 1; + while (xfer_size) { vm_page_t t, m; - if (delayed_unlock == 0) - vm_page_lock_queues(); - m = VM_PAGE_NULL; if (upl->flags & UPL_LITE) { @@ -3937,6 +4044,17 @@ upl_commit_range( m->dirty = TRUE; else if (flags & UPL_COMMIT_CLEAR_DIRTY) { m->dirty = FALSE; + if (m->cs_validated && !m->cs_tainted) { + /* + * CODE SIGNING: + * This page is no longer dirty + * but could have been modified, + * so it will need to be + * re-validated. + */ + m->cs_validated = FALSE; + vm_cs_validated_resets++; + } clear_refmod |= VM_MEM_MODIFIED; } if (flags & UPL_COMMIT_INACTIVATE) @@ -3964,6 +4082,17 @@ upl_commit_range( */ if (flags & UPL_COMMIT_CLEAR_DIRTY) { m->dirty = FALSE; + if (m->cs_validated && !m->cs_tainted) { + /* + * CODE SIGNING: + * This page is no longer dirty + * but could have been modified, + * so it will need to be + * re-validated. + */ + m->cs_validated = FALSE; + vm_cs_validated_resets++; + } clear_refmod |= VM_MEM_MODIFIED; } if (clear_refmod) @@ -4003,6 +4132,17 @@ upl_commit_range( if (m->wanted) vm_pageout_target_collisions++; #endif m->dirty = FALSE; + if (m->cs_validated && !m->cs_tainted) { + /* + * CODE SIGNING: + * This page is no longer dirty + * but could have been modified, + * so it will need to be + * re-validated. 
+ */ + m->cs_validated = FALSE; + vm_cs_validated_resets++; + } if (m->pmapped && (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED)) m->dirty = TRUE; @@ -4049,7 +4189,7 @@ upl_commit_range( goto commit_next_page; } #if MACH_CLUSTER_STATS - if (m->pmapped) + if (m->wpmapped) m->dirty = pmap_is_modified(m->phys_page); if (m->dirty) vm_pageout_cluster_dirtied++; @@ -4057,6 +4197,17 @@ upl_commit_range( if (m->wanted) vm_pageout_cluster_collisions++; #endif m->dirty = FALSE; + if (m->cs_validated && !m->cs_tainted) { + /* + * CODE SIGNING: + * This page is no longer dirty + * but could have been modified, + * so it will need to be + * re-validated. + */ + m->cs_validated = FALSE; + vm_cs_validated_resets++; + } if ((m->busy) && (m->cleaning)) { /* @@ -4122,7 +4273,29 @@ commit_next_page: entry++; if (delayed_unlock++ > UPL_DELAYED_UNLOCK_LIMIT) { + /* + * pageout_scan takes the vm_page_lock_queues first + * then tries for the object lock... to avoid what + * is effectively a lock inversion, we'll go to the + * trouble of taking them in that same order... otherwise + * if this object contains the majority of the pages resident + * in the UBC (or a small set of large objects actively being + * worked on contain the majority of the pages), we could + * cause the pageout_scan thread to 'starve' in its attempt + * to find pages to move to the free queue, since it has to + * successfully acquire the object lock of any candidate page + * before it can steal/clean it. 
+ */ + vm_object_unlock(shadow_object); mutex_yield(&vm_page_queue_lock); + + for (j = 0; ; j++) { + if (vm_object_lock_try(shadow_object)) + break; + vm_page_unlock_queues(); + mutex_pause(j); + vm_page_lock_queues(); + } delayed_unlock = 1; } } @@ -4199,6 +4372,7 @@ upl_abort_range( wpl_array_t lite_list; int occupied; int delayed_unlock = 0; + int j; *empty = FALSE; @@ -4233,17 +4407,35 @@ upl_abort_range( } else shadow_object = object; - vm_object_lock(shadow_object); - entry = offset/PAGE_SIZE; target_offset = (vm_object_offset_t)offset; + /* + * pageout_scan takes the vm_page_lock_queues first + * then tries for the object lock... to avoid what + * is effectively a lock inversion, we'll go to the + * trouble of taking them in that same order... otherwise + * if this object contains the majority of the pages resident + * in the UBC (or a small set of large objects actively being + * worked on contain the majority of the pages), we could + * cause the pageout_scan thread to 'starve' in its attempt + * to find pages to move to the free queue, since it has to + * successfully acquire the object lock of any candidate page + * before it can steal/clean it. + */ + for (j = 0; ; j++) { + vm_page_lock_queues(); + + if (vm_object_lock_try(shadow_object)) + break; + vm_page_unlock_queues(); + mutex_pause(j); + } + delayed_unlock = 1; + while (xfer_size) { vm_page_t t, m; - if (delayed_unlock == 0) - vm_page_lock_queues(); - m = VM_PAGE_NULL; if (upl->flags & UPL_LITE) { @@ -4352,7 +4544,29 @@ upl_abort_range( } } if (delayed_unlock++ > UPL_DELAYED_UNLOCK_LIMIT) { + /* + * pageout_scan takes the vm_page_lock_queues first + * then tries for the object lock... to avoid what + * is effectively a lock inversion, we'll go to the + * trouble of taking them in that same order... 
otherwise + * if this object contains the majority of the pages resident + * in the UBC (or a small set of large objects actively being + * worked on contain the majority of the pages), we could + * cause the pageout_scan thread to 'starve' in its attempt + * to find pages to move to the free queue, since it has to + * successfully acquire the object lock of any candidate page + * before it can steal/clean it. + */ + vm_object_unlock(shadow_object); mutex_yield(&vm_page_queue_lock); + + for (j = 0; ; j++) { + if (vm_object_lock_try(shadow_object)) + break; + vm_page_unlock_queues(); + mutex_pause(j); + vm_page_lock_queues(); + } delayed_unlock = 1; } target_offset += PAGE_SIZE_64; @@ -5230,6 +5444,7 @@ vm_paging_map_object( pmap_sync_page_data_phys(page->phys_page); } page->pmapped = TRUE; + page->wpmapped = TRUE; cache_attr = ((unsigned int) object->wimg_bits) & VM_WIMG_MASK; //assert(pmap_verify_free(page->phys_page)); @@ -5656,6 +5871,17 @@ vm_page_decrypt( * and the decryption doesn't count. */ page->dirty = FALSE; + if (page->cs_validated && !page->cs_tainted) { + /* + * CODE SIGNING: + * This page is no longer dirty + * but could have been modified, + * so it will need to be + * re-validated. + */ + page->cs_validated = FALSE; + vm_cs_validated_resets++; + } pmap_clear_refmod(page->phys_page, VM_MEM_MODIFIED | VM_MEM_REFERENCED); page->encrypted = FALSE; @@ -5676,6 +5902,7 @@ vm_page_decrypt( */ assert(pmap_verify_free(page->phys_page)); page->pmapped = FALSE; + page->wpmapped = FALSE; vm_object_paging_end(page->object); } diff --git a/osfmk/vm/vm_purgeable.c b/osfmk/vm/vm_purgeable.c index df1f3f810..dfd80266f 100644 --- a/osfmk/vm/vm_purgeable.c +++ b/osfmk/vm/vm_purgeable.c @@ -33,9 +33,9 @@ struct token { struct token tokens[MAX_VOLATILE]; -token_idx_t token_free_idx = 0; /* head of free queue */ -token_cnt_t token_init_count = 1; /* token 0 is reserved!! 
*/ -token_cnt_t token_new_pagecount = 0; /* count of pages that will +token_idx_t token_free_idx = 0; /* head of free queue */ +token_idx_t token_init_idx = 1; /* token 0 is reserved!! */ +int32_t token_new_pagecount = 0; /* count of pages that will * be added onto token queue */ int available_for_purge = 0; /* increase when ripe token @@ -96,9 +96,9 @@ vm_purgeable_token_add(purgeable_q_t queue) token_idx_t token; enum purgeable_q_type i; - if (token_init_count < MAX_VOLATILE) { /* lazy token array init */ - token = token_init_count; - token_init_count++; + if (token_init_idx < MAX_VOLATILE) { /* lazy token array init */ + token = token_init_idx; + token_init_idx++; } else if (token_free_idx) { token = token_free_idx; token_free_idx = tokens[token_free_idx].next; @@ -111,9 +111,10 @@ vm_purgeable_token_add(purgeable_q_t queue) * obsolete */ for (i = PURGEABLE_Q_TYPE_FIFO; i < PURGEABLE_Q_TYPE_MAX; i++) { - purgeable_queues[i].new_pages += token_new_pagecount; - assert(purgeable_queues[i].new_pages >= 0); - assert((uint64_t) (purgeable_queues[i].new_pages) <= TOKEN_COUNT_MAX); + int64_t pages = purgeable_queues[i].new_pages += token_new_pagecount; + assert(pages >= 0); + assert(pages <= TOKEN_COUNT_MAX); + purgeable_queues[i].new_pages=pages; } token_new_pagecount = 0; @@ -235,6 +236,20 @@ vm_purgeable_token_delete_first(purgeable_q_t queue) void vm_purgeable_q_advance_all(uint32_t num_pages) { + /* check queue counters - if they get really large, scale them back. 
+ * They tend to get that large when there is no purgeable queue action */ + int i; + if(token_new_pagecount > (INT32_MAX >> 1)) /* a system idling years might get there */ + { + for (i = PURGEABLE_Q_TYPE_FIFO; i < PURGEABLE_Q_TYPE_MAX; i++) { + int64_t pages = purgeable_queues[i].new_pages += token_new_pagecount; + assert(pages >= 0); + assert(pages <= TOKEN_COUNT_MAX); + purgeable_queues[i].new_pages=pages; + } + token_new_pagecount = 0; + } + /* * don't need to advance obsolete queue - all items are ripe there, * always diff --git a/osfmk/vm/vm_purgeable_internal.h b/osfmk/vm/vm_purgeable_internal.h index ab2db597e..e225da651 100644 --- a/osfmk/vm/vm_purgeable_internal.h +++ b/osfmk/vm/vm_purgeable_internal.h @@ -46,12 +46,7 @@ enum purgeable_q_type { PURGEABLE_Q_TYPE_MAX }; -/* - * It appears there's a 16 vs 32 size mismatch when using - * CONFIG_TOKEN_QUEUE_SMALL and the resulting math can lead to a large - * negative value for new_pages in vm_purgeable.c. - */ -#if (CONFIG_TOKEN_QUEUE_SMALL == 1) && 0 +#if (CONFIG_TOKEN_QUEUE_SMALL == 1) typedef uint16_t token_idx_t; typedef uint16_t token_cnt_t; #define MAX_VOLATILE 0x01000 @@ -80,7 +75,7 @@ struct purgeable_q { typedef struct purgeable_q * purgeable_q_t; extern struct purgeable_q purgeable_queues[PURGEABLE_Q_TYPE_MAX]; -extern token_cnt_t token_new_pagecount; +extern int32_t token_new_pagecount; extern int available_for_purge; diff --git a/osfmk/vm/vm_resident.c b/osfmk/vm/vm_resident.c index 5d4d80b47..f50356d0d 100644 --- a/osfmk/vm/vm_resident.c +++ b/osfmk/vm/vm_resident.c @@ -100,8 +100,6 @@ int speculative_steal_index = 0; struct vm_speculative_age_q vm_page_queue_speculative[VM_PAGE_MAX_SPECULATIVE_AGE_Q + 1]; -static void vm_page_insert_internal(vm_page_t, vm_object_t, vm_object_offset_t, boolean_t); - /* * Associated with page of user-allocatable memory is a @@ -406,6 +404,7 @@ vm_page_bootstrap( m->laundry = FALSE; m->free = FALSE; m->pmapped = FALSE; + m->wpmapped = FALSE; m->reference = FALSE; 
m->pageout = FALSE; m->dump_cleaning = FALSE; @@ -889,7 +888,7 @@ vm_page_insert( } -static void +void vm_page_insert_internal( vm_page_t mem, vm_object_t object, @@ -1546,6 +1545,7 @@ vm_page_grablo(void) assert(mem->free); assert(mem->busy); assert(!mem->pmapped); + assert(!mem->wpmapped); mem->pageq.next = NULL; mem->pageq.prev = NULL; @@ -1613,6 +1613,7 @@ return_page_from_cpu_list: assert(mem->busy); assert(!mem->encrypted); assert(!mem->pmapped); + assert(!mem->wpmapped); return mem; } @@ -1723,6 +1724,7 @@ return_page_from_cpu_list: assert(!mem->free); assert(!mem->encrypted); assert(!mem->pmapped); + assert(!mem->wpmapped); } PROCESSOR_DATA(current_processor(), free_pages) = head->pageq.next; PROCESSOR_DATA(current_processor(), start_color) = color; @@ -2090,6 +2092,7 @@ vm_page_free_prepare( mem->encrypted_cleaning = FALSE; mem->deactivated = FALSE; mem->pmapped = FALSE; + mem->wpmapped = FALSE; if (mem->private) { mem->private = FALSE; @@ -2805,11 +2808,9 @@ vm_page_copy( dest_m->encrypted = FALSE; if (src_m->object != VM_OBJECT_NULL && - src_m->object->code_signed && - !src_m->cs_validated) { + src_m->object->code_signed) { /* - * We're copying a not-yet-validated page from a - * code-signed object. + * We're copying a page from a code-signed object. * Whoever ends up mapping the copy page might care about * the original page's integrity, so let's validate the * source page now. 
diff --git a/osfmk/vm/vm_shared_region.c b/osfmk/vm/vm_shared_region.c index 50632a9a0..f6975e1c1 100644 --- a/osfmk/vm/vm_shared_region.c +++ b/osfmk/vm/vm_shared_region.c @@ -101,6 +101,8 @@ #include #include +#include + #include #include @@ -770,6 +772,9 @@ vm_shared_region_map_file( unsigned int i; mach_port_t map_port; mach_vm_offset_t target_address; + vm_object_t object; + vm_object_size_t obj_size; + kr = KERN_SUCCESS; @@ -844,51 +849,143 @@ vm_shared_region_map_file( target_address = mappings[i].sfm_address - sr_base_address; - /* establish that mapping, OK if it's to "already" there */ - kr = vm_map_enter_mem_object( - sr_map, - &target_address, - vm_map_round_page(mappings[i].sfm_size), - 0, - VM_FLAGS_FIXED | VM_FLAGS_ALREADY, - map_port, - mappings[i].sfm_file_offset, - TRUE, - mappings[i].sfm_init_prot & VM_PROT_ALL, - mappings[i].sfm_max_prot & VM_PROT_ALL, - VM_INHERIT_DEFAULT); - if (kr == KERN_MEMORY_PRESENT) { - /* this exact mapping was already there: that's fine */ - SHARED_REGION_TRACE_INFO( - ("shared_region: mapping[%d]: " - "address:0x%016llx size:0x%016llx " - "offset:0x%016llx " - "maxprot:0x%x prot:0x%x already mapped...\n", - i, - (long long)mappings[i].sfm_address, - (long long)mappings[i].sfm_size, - (long long)mappings[i].sfm_file_offset, - mappings[i].sfm_max_prot, - mappings[i].sfm_init_prot)); - kr = KERN_SUCCESS; - } else if (kr != KERN_SUCCESS) { - /* this mapping failed ! */ - SHARED_REGION_TRACE_ERROR( - ("shared_region: mapping[%d]: " - "address:0x%016llx size:0x%016llx " - "offset:0x%016llx " - "maxprot:0x%x prot:0x%x failed 0x%x\n", - i, - (long long)mappings[i].sfm_address, - (long long)mappings[i].sfm_size, - (long long)mappings[i].sfm_file_offset, - mappings[i].sfm_max_prot, - mappings[i].sfm_init_prot, - kr)); - break; + /* establish that mapping, OK if it's "already" there */ + if (map_port == MACH_PORT_NULL) { + /* + * We want to map some anonymous memory in a + * shared region. 
+ * We have to create the VM object now, so that it + * can be mapped "copy-on-write". + */ + obj_size = vm_map_round_page(mappings[i].sfm_size); + object = vm_object_allocate(obj_size); + if (object == VM_OBJECT_NULL) { + kr = KERN_RESOURCE_SHORTAGE; + } else { + kr = vm_map_enter( + sr_map, + &target_address, + vm_map_round_page(mappings[i].sfm_size), + 0, + VM_FLAGS_FIXED | VM_FLAGS_ALREADY, + object, + 0, + TRUE, + mappings[i].sfm_init_prot & VM_PROT_ALL, + mappings[i].sfm_max_prot & VM_PROT_ALL, + VM_INHERIT_DEFAULT); + } + } else { + object = VM_OBJECT_NULL; /* no anonymous memory here */ + kr = vm_map_enter_mem_object( + sr_map, + &target_address, + vm_map_round_page(mappings[i].sfm_size), + 0, + VM_FLAGS_FIXED | VM_FLAGS_ALREADY, + map_port, + mappings[i].sfm_file_offset, + TRUE, + mappings[i].sfm_init_prot & VM_PROT_ALL, + mappings[i].sfm_max_prot & VM_PROT_ALL, + VM_INHERIT_DEFAULT); } - /* we're protected by "sr_mapping_in_progress" */ + if (kr != KERN_SUCCESS) { + if (map_port == MACH_PORT_NULL) { + /* + * Get rid of the VM object we just created + * but failed to map. + */ + vm_object_deallocate(object); + object = VM_OBJECT_NULL; + } + if (kr == KERN_MEMORY_PRESENT) { + /* + * This exact mapping was already there: + * that's fine. + */ + SHARED_REGION_TRACE_INFO( + ("shared_region: mapping[%d]: " + "address:0x%016llx size:0x%016llx " + "offset:0x%016llx " + "maxprot:0x%x prot:0x%x " + "already mapped...\n", + i, + (long long)mappings[i].sfm_address, + (long long)mappings[i].sfm_size, + (long long)mappings[i].sfm_file_offset, + mappings[i].sfm_max_prot, + mappings[i].sfm_init_prot)); + /* + * We didn't establish this mapping ourselves; + * let's reset its size, so that we do not + * attempt to undo it if an error occurs later. + */ + mappings[i].sfm_size = 0; + kr = KERN_SUCCESS; + } else { + unsigned int j; + + /* this mapping failed ! 
*/ + SHARED_REGION_TRACE_ERROR( + ("shared_region: mapping[%d]: " + "address:0x%016llx size:0x%016llx " + "offset:0x%016llx " + "maxprot:0x%x prot:0x%x failed 0x%x\n", + i, + (long long)mappings[i].sfm_address, + (long long)mappings[i].sfm_size, + (long long)mappings[i].sfm_file_offset, + mappings[i].sfm_max_prot, + mappings[i].sfm_init_prot, + kr)); + + /* + * Undo the mappings we've established so far. + */ + for (j = 0; j < i; j++) { + kern_return_t kr2; + + if (mappings[j].sfm_size == 0) { + /* + * We didn't establish this + * mapping, so nothing to undo. + */ + continue; + } + SHARED_REGION_TRACE_INFO( + ("shared_region: mapping[%d]: " + "address:0x%016llx " + "size:0x%016llx " + "offset:0x%016llx " + "maxprot:0x%x prot:0x%x: " + "undoing...\n", + j, + (long long)mappings[j].sfm_address, + (long long)mappings[j].sfm_size, + (long long)mappings[j].sfm_file_offset, + mappings[j].sfm_max_prot, + mappings[j].sfm_init_prot)); + kr2 = mach_vm_deallocate( + sr_map, + (mappings[j].sfm_address - + sr_base_address), + mappings[j].sfm_size); + assert(kr2 == KERN_SUCCESS); + } + + break; + } + + } + + /* + * Record the first (chronologically) mapping in + * this shared region. + * We're protected by "sr_mapping_in_progress" here, + * so no need to lock "shared_region". 
+ */ if (shared_region->sr_first_mapping == (mach_vm_offset_t) -1) { shared_region->sr_first_mapping = target_address; } diff --git a/security/conf/MASTER b/security/conf/MASTER index d3fcf6d1f..d692d5ae5 100644 --- a/security/conf/MASTER +++ b/security/conf/MASTER @@ -55,7 +55,7 @@ ident SECURITY # Note: MAC options must be set in both bsd/conf and security/conf MASTER files # options KDEBUG # kernel tracing # -options AUDIT # Security event auditing +options AUDIT # Security event auditing # options CONFIG_LCTX # Login Context options CONFIG_DTRACE # dtrace support # diff --git a/security/conf/MASTER.i386 b/security/conf/MASTER.i386 index 01b3a55d2..1bd463765 100644 --- a/security/conf/MASTER.i386 +++ b/security/conf/MASTER.i386 @@ -1,16 +1,17 @@ ###################################################################### # -# RELEASE = [ intel mach libkerncpp config_dtrace ] +# RELEASE = [ intel mach libkerncpp config_dtrace audit ] # PROFILE = [ RELEASE profile ] # DEBUG = [ RELEASE debug ] # -# EMBEDDED = [ intel mach libkerncpp ] +# EMBEDDED = [ intel mach libkerncpp audit ] # DEVELOPMENT = [ EMBEDDED config_dtrace ] # ###################################################################### # -# Note: MAC options must be set in both bsd/conf and security/conf MASTER files +# Note: MAC options must be set in all the bsd/conf, osfmk/conf, and +# security/conf MASTER files. 
# options CONFIG_MACF # Mandatory Access Control Framework options CONFIG_MACF_SOCKET_SUBSET # MACF subset of socket support diff --git a/security/conf/MASTER.ppc b/security/conf/MASTER.ppc index 177301b38..534e8d2fc 100644 --- a/security/conf/MASTER.ppc +++ b/security/conf/MASTER.ppc @@ -4,7 +4,7 @@ # Standard Apple MacOS X Configurations: # -------- ---- -------- --------------- # -# RELEASE = [ppc mach libkerncpp config_dtrace] +# RELEASE = [ppc mach libkerncpp config_dtrace audit] # DEVELOPMENT = [RELEASE] # PROFILE = [RELEASE] # DEBUG = [RELEASE debug] @@ -14,8 +14,8 @@ ###################################################################### # -# Note: corresponding MACF options must be set in both security/conf -# bsd/conf and/or osfmk/conf MASTER files (depending upon the option) +# Note: MAC options must be set in all the bsd/conf, osfmk/conf, and +# security/conf MASTER files. # options CONFIG_MACF # Mandatory Access Control Framework options CONFIG_MACF_SOCKET_SUBSET # MACF subset of socket support diff --git a/security/conf/Makefile.template b/security/conf/Makefile.template index b5a57e158..f697e624e 100644 --- a/security/conf/Makefile.template +++ b/security/conf/Makefile.template @@ -26,7 +26,7 @@ include $(MakeInc_def) # # XXX: CFLAGS # -CFLAGS+= -DKERNEL -DBSD_KERNEL_PRIVATE \ +CFLAGS+= -I. 
-imacros meta_features.h -DKERNEL -DBSD_KERNEL_PRIVATE \ -Wall -Wno-four-char-constants -fno-common # diff --git a/security/conf/files b/security/conf/files index c0565103d..bea378a45 100644 --- a/security/conf/files +++ b/security/conf/files @@ -1,6 +1,12 @@ # options # OPTIONS/kdebug optional kdebug +OPTIONS/audit optional audit +OPTIONS/config_macf optional config_macf +OPTIONS/config_macf_socket_subset optional config_macf_socket_subset +OPTIONS/config_macf_socket optional config_macf_socket +OPTIONS/config_macf_net optional config_macf_net + # security security/mac_alloc.c optional config_macf diff --git a/security/mac_audit.c b/security/mac_audit.c index cb61c1912..286b6ad5a 100644 --- a/security/mac_audit.c +++ b/security/mac_audit.c @@ -74,7 +74,7 @@ #include #include -#ifdef AUDIT +#if AUDIT /* The zone allocator is initialized in mac_base.c. */ zone_t mac_audit_data_zone; @@ -395,4 +395,10 @@ mac_audit(int len, u_char *data) return (0); } + +int +mac_audit_text(__unused char *text, __unused mac_policy_handle_t handle) +{ + return (0); +} #endif /* !AUDIT */ diff --git a/security/mac_base.c b/security/mac_base.c index 37c9d05af..b65948131 100644 --- a/security/mac_base.c +++ b/security/mac_base.c @@ -248,12 +248,14 @@ SYSCTL_UINT(_security_mac, OID_AUTO, label_mbufs, CTLFLAG_RW, &mac_label_mbufs, 0, "Label all MBUFs"); #endif +#if AUDIT /* * mac_audit_data_zone is the zone used for data pushed into the audit * record by policies. Using a zone simplifies memory management of this * data, and allows tracking of the amount of data in flight. */ extern zone_t mac_audit_data_zone; +#endif /* * mac_policy_list holds the list of policy modules. 
Modules with a @@ -540,9 +542,11 @@ mac_policy_initbsd(void) struct mac_policy_conf *mpc; u_int i; +#if AUDIT mac_audit_data_zone = zinit(MAC_AUDIT_DATA_LIMIT, AQ_HIWATER * MAC_AUDIT_DATA_LIMIT, 8192, "mac_audit_data_zone"); +#endif printf("MAC Framework successfully initialized\n"); diff --git a/tools/tests/xnu_quick_test/tests.c b/tools/tests/xnu_quick_test/tests.c index ad0db0e63..4bd3f0e4a 100644 --- a/tools/tests/xnu_quick_test/tests.c +++ b/tools/tests/xnu_quick_test/tests.c @@ -4577,7 +4577,7 @@ int aio_tests( void * the_argp ) my_aiocbp = &my_aiocbs[ 0 ]; my_aiocbp->aio_fildes = my_fd_list[ 0 ]; - my_aiocbp->aio_offset = 0; + my_aiocbp->aio_offset = 4096; my_aiocbp->aio_buf = my_buffers[ 0 ]; my_aiocbp->aio_nbytes = AIO_TESTS_BUFFER_SIZE; my_aiocbp->aio_reqprio = 0; -- 2.47.2